/*
 * Copyright (c) 2021 Loongson Technology Corporation Limited
 * All rights reserved.
 * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
 *                Xiwei Gu   <guxiwei-hf@loongson.cn>
 *                Lu Wang    <wanglu@loongson.cn>
 *
 * This file is a header file for loongarch builtin extension.
 *
 */
11
12 #ifndef LOONGSON_INTRINSICS_H
13 #define LOONGSON_INTRINSICS_H
14
/**
 * Version of this helper header.
 * MAJOR version: Macro usage changes.
 * MINOR version: Add new functions, or bug fixes.
 * MICRO version: Comment changes or implementation changes.
 */
#define LSOM_VERSION_MAJOR 1
#define LSOM_VERSION_MINOR 1
#define LSOM_VERSION_MICRO 0
23
/*
 * Apply the one-argument operation _INS to two independent inputs:
 *   _OUT0 = _INS(_IN0);  _OUT1 = _INS(_IN1);
 * _INS is any callable name (function or function-like macro); _OUT0 and
 * _OUT1 must be assignable.
 * The expansion is a plain `{ ... }` block, not do { } while (0): put braces
 * around the call when it is the body of an `if` that has an `else`.
 */
#define DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1) \
  {                                               \
    _OUT0 = _INS(_IN0);                           \
    _OUT1 = _INS(_IN1);                           \
  }
29
/*
 * Apply the two-argument operation _INS to two independent argument pairs:
 *   _OUT0 = _INS(_IN0, _IN1);  _OUT1 = _INS(_IN2, _IN3);
 * _INS is any callable name (function or function-like macro); _OUT0 and
 * _OUT1 must be assignable.
 * The expansion is a plain `{ ... }` block, not do { } while (0): put braces
 * around the call when it is the body of an `if` that has an `else`.
 */
#define DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1) \
  {                                                           \
    _OUT0 = _INS(_IN0, _IN1);                                 \
    _OUT1 = _INS(_IN2, _IN3);                                 \
  }
35
/*
 * Apply the three-argument operation _INS to two independent argument
 * triples:
 *   _OUT0 = _INS(_IN0, _IN1, _IN2);  _OUT1 = _INS(_IN3, _IN4, _IN5);
 * _INS is any callable name (function or function-like macro); _OUT0 and
 * _OUT1 must be assignable.
 * The expansion is a plain `{ ... }` block, not do { } while (0): put braces
 * around the call when it is the body of an `if` that has an `else`.
 */
#define DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1) \
  {                                                                       \
    _OUT0 = _INS(_IN0, _IN1, _IN2);                                       \
    _OUT1 = _INS(_IN3, _IN4, _IN5);                                       \
  }
41
/*
 * Apply the one-argument operation _INS to four independent inputs:
 *   _OUTk = _INS(_INk)  for k = 0..3, evaluated in that order.
 * _INS is any callable name (function or function-like macro); the outputs
 * must be assignable.
 * The expansion is a plain `{ ... }` block, not do { } while (0): put braces
 * around the call when it is the body of an `if` that has an `else`.
 */
#define DUP4_ARG1(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1, _OUT2, _OUT3) \
  {                                                                         \
    _OUT0 = _INS(_IN0);                                                     \
    _OUT1 = _INS(_IN1);                                                     \
    _OUT2 = _INS(_IN2);                                                     \
    _OUT3 = _INS(_IN3);                                                     \
  }
47
/*
 * Apply the two-argument operation _INS to four independent argument pairs:
 *   _OUT0 = _INS(_IN0, _IN1);  _OUT1 = _INS(_IN2, _IN3);
 *   _OUT2 = _INS(_IN4, _IN5);  _OUT3 = _INS(_IN6, _IN7);
 * evaluated in that order. _INS is any callable name (function or
 * function-like macro); the outputs must be assignable.
 * The expansion is a plain `{ ... }` block, not do { } while (0): put braces
 * around the call when it is the body of an `if` that has an `else`.
 */
#define DUP4_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _OUT0, \
                  _OUT1, _OUT2, _OUT3)                                         \
  {                                                                            \
    _OUT0 = _INS(_IN0, _IN1);                                                  \
    _OUT1 = _INS(_IN2, _IN3);                                                  \
    _OUT2 = _INS(_IN4, _IN5);                                                  \
    _OUT3 = _INS(_IN6, _IN7);                                                  \
  }
54
/*
 * Apply the three-argument operation _INS to four independent argument
 * triples:
 *   _OUT0 = _INS(_IN0, _IN1, _IN2);   _OUT1 = _INS(_IN3, _IN4, _IN5);
 *   _OUT2 = _INS(_IN6, _IN7, _IN8);   _OUT3 = _INS(_IN9, _IN10, _IN11);
 * evaluated in that order. _INS is any callable name (function or
 * function-like macro); the outputs must be assignable.
 * The expansion is a plain `{ ... }` block, not do { } while (0): put braces
 * around the call when it is the body of an `if` that has an `else`.
 */
#define DUP4_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _IN8, \
                  _IN9, _IN10, _IN11, _OUT0, _OUT1, _OUT2, _OUT3)             \
  {                                                                           \
    _OUT0 = _INS(_IN0, _IN1, _IN2);                                           \
    _OUT1 = _INS(_IN3, _IN4, _IN5);                                           \
    _OUT2 = _INS(_IN6, _IN7, _IN8);                                           \
    _OUT3 = _INS(_IN9, _IN10, _IN11);                                         \
  }
61
62 #ifdef __loongarch_sx
63 #include <lsxintrin.h>
64 /*
65 * =============================================================================
66 * Description : Dot product & addition of byte vector elements
67 * Arguments : Inputs - in_c, in_h, in_l
68 * Outputs - out
69 * Return Type - halfword
70 * Details : Signed byte elements from in_h are multiplied by
71 * signed byte elements from in_l, and then added adjacent to
72 * each other to get results with the twice size of input.
73 * Then the results plus to signed half-word elements from in_c.
74 * Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
75 * in_c : 1,2,3,4, 1,2,3,4
76 * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
77 * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
78 * out : 23,40,41,26, 23,40,41,26
79 * =============================================================================
80 */
__lsx_vdp2add_h_b(__m128i in_c,__m128i in_h,__m128i in_l)81 static inline __m128i __lsx_vdp2add_h_b(__m128i in_c, __m128i in_h,
82 __m128i in_l) {
83 __m128i out;
84
85 out = __lsx_vmaddwev_h_b(in_c, in_h, in_l);
86 out = __lsx_vmaddwod_h_b(out, in_h, in_l);
87 return out;
88 }
89
90 /*
91 * =============================================================================
92 * Description : Dot product & addition of byte vector elements
93 * Arguments : Inputs - in_c, in_h, in_l
94 * Outputs - out
95 * Return Type - halfword
96 * Details : Unsigned byte elements from in_h are multiplied by
97 * unsigned byte elements from in_l, and then added adjacent to
98 * each other to get results with the twice size of input.
99 * The results plus to signed half-word elements from in_c.
100 * Example : out = __lsx_vdp2add_h_bu(in_c, in_h, in_l)
101 * in_c : 1,2,3,4, 1,2,3,4
102 * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
103 * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
104 * out : 23,40,41,26, 23,40,41,26
105 * =============================================================================
106 */
__lsx_vdp2add_h_bu(__m128i in_c,__m128i in_h,__m128i in_l)107 static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, __m128i in_h,
108 __m128i in_l) {
109 __m128i out;
110
111 out = __lsx_vmaddwev_h_bu(in_c, in_h, in_l);
112 out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
113 return out;
114 }
115
116 /*
117 * =============================================================================
118 * Description : Dot product & addition of byte vector elements
119 * Arguments : Inputs - in_c, in_h, in_l
120 * Outputs - out
121 * Return Type - halfword
122 * Details : Unsigned byte elements from in_h are multiplied by
123 * signed byte elements from in_l, and then added adjacent to
124 * each other to get results with the twice size of input.
125 * The results plus to signed half-word elements from in_c.
126 * Example : out = __lsx_vdp2add_h_bu_b(in_c, in_h, in_l)
127 * in_c : 1,1,1,1, 1,1,1,1
128 * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
129 * in_l : -1,-2,-3,-4, -5,-6,-7,-8, 1,2,3,4, 5,6,7,8
130 * out : -4,-24,-60,-112, 6,26,62,114
131 * =============================================================================
132 */
__lsx_vdp2add_h_bu_b(__m128i in_c,__m128i in_h,__m128i in_l)133 static inline __m128i __lsx_vdp2add_h_bu_b(__m128i in_c, __m128i in_h,
134 __m128i in_l) {
135 __m128i out;
136
137 out = __lsx_vmaddwev_h_bu_b(in_c, in_h, in_l);
138 out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
139 return out;
140 }
141
142 /*
143 * =============================================================================
144 * Description : Dot product & addition of half-word vector elements
145 * Arguments : Inputs - in_c, in_h, in_l
146 * Outputs - out
147 * Return Type - __m128i
148 * Details : Signed half-word elements from in_h are multiplied by
149 * signed half-word elements from in_l, and then added adjacent to
150 * each other to get results with the twice size of input.
151 * Then the results plus to signed word elements from in_c.
152 * Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
153 * in_c : 1,2,3,4
154 * in_h : 1,2,3,4, 5,6,7,8
155 * in_l : 8,7,6,5, 4,3,2,1
156 * out : 23,40,41,26
157 * =============================================================================
158 */
__lsx_vdp2add_w_h(__m128i in_c,__m128i in_h,__m128i in_l)159 static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, __m128i in_h,
160 __m128i in_l) {
161 __m128i out;
162
163 out = __lsx_vmaddwev_w_h(in_c, in_h, in_l);
164 out = __lsx_vmaddwod_w_h(out, in_h, in_l);
165 return out;
166 }
167
168 /*
169 * =============================================================================
170 * Description : Dot product of byte vector elements
171 * Arguments : Inputs - in_h, in_l
172 * Outputs - out
173 * Return Type - halfword
174 * Details : Signed byte elements from in_h are multiplied by
175 * signed byte elements from in_l, and then added adjacent to
176 * each other to get results with the twice size of input.
177 * Example : out = __lsx_vdp2_h_b(in_h, in_l)
178 * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
179 * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
180 * out : 22,38,38,22, 22,38,38,22
181 * =============================================================================
182 */
__lsx_vdp2_h_b(__m128i in_h,__m128i in_l)183 static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l) {
184 __m128i out;
185
186 out = __lsx_vmulwev_h_b(in_h, in_l);
187 out = __lsx_vmaddwod_h_b(out, in_h, in_l);
188 return out;
189 }
190
191 /*
192 * =============================================================================
193 * Description : Dot product of byte vector elements
194 * Arguments : Inputs - in_h, in_l
195 * Outputs - out
196 * Return Type - halfword
197 * Details : Unsigned byte elements from in_h are multiplied by
198 * unsigned byte elements from in_l, and then added adjacent to
199 * each other to get results with the twice size of input.
200 * Example : out = __lsx_vdp2_h_bu(in_h, in_l)
201 * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
202 * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
203 * out : 22,38,38,22, 22,38,38,22
204 * =============================================================================
205 */
__lsx_vdp2_h_bu(__m128i in_h,__m128i in_l)206 static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l) {
207 __m128i out;
208
209 out = __lsx_vmulwev_h_bu(in_h, in_l);
210 out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
211 return out;
212 }
213
214 /*
215 * =============================================================================
216 * Description : Dot product of byte vector elements
217 * Arguments : Inputs - in_h, in_l
218 * Outputs - out
219 * Return Type - halfword
220 * Details : Unsigned byte elements from in_h are multiplied by
221 * signed byte elements from in_l, and then added adjacent to
222 * each other to get results with the twice size of input.
223 * Example : out = __lsx_vdp2_h_bu_b(in_h, in_l)
224 * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
225 * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,-1
226 * out : 22,38,38,22, 22,38,38,6
227 * =============================================================================
228 */
__lsx_vdp2_h_bu_b(__m128i in_h,__m128i in_l)229 static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l) {
230 __m128i out;
231
232 out = __lsx_vmulwev_h_bu_b(in_h, in_l);
233 out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
234 return out;
235 }
236
237 /*
238 * =============================================================================
239 * Description : Dot product of byte vector elements
240 * Arguments : Inputs - in_h, in_l
241 * Outputs - out
242 * Return Type - halfword
243 * Details : Signed byte elements from in_h are multiplied by
244 * signed byte elements from in_l, and then added adjacent to
245 * each other to get results with the twice size of input.
246 * Example : out = __lsx_vdp2_w_h(in_h, in_l)
247 * in_h : 1,2,3,4, 5,6,7,8
248 * in_l : 8,7,6,5, 4,3,2,1
249 * out : 22,38,38,22
250 * =============================================================================
251 */
__lsx_vdp2_w_h(__m128i in_h,__m128i in_l)252 static inline __m128i __lsx_vdp2_w_h(__m128i in_h, __m128i in_l) {
253 __m128i out;
254
255 out = __lsx_vmulwev_w_h(in_h, in_l);
256 out = __lsx_vmaddwod_w_h(out, in_h, in_l);
257 return out;
258 }
259
260 /*
261 * =============================================================================
262 * Description : Clip all halfword elements of input vector between min & max
263 * out = ((_in) < (min)) ? (min) : (((_in) > (max)) ? (max) :
264 * (_in))
265 * Arguments : Inputs - _in (input vector)
266 * - min (min threshold)
267 * - max (max threshold)
268 * Outputs - out (output vector with clipped elements)
269 * Return Type - signed halfword
270 * Example : out = __lsx_vclip_h(_in)
271 * _in : -8,2,280,249, -8,255,280,249
272 * min : 1,1,1,1, 1,1,1,1
273 * max : 9,9,9,9, 9,9,9,9
274 * out : 1,2,9,9, 1,9,9,9
275 * =============================================================================
276 */
__lsx_vclip_h(__m128i _in,__m128i min,__m128i max)277 static inline __m128i __lsx_vclip_h(__m128i _in, __m128i min, __m128i max) {
278 __m128i out;
279
280 out = __lsx_vmax_h(min, _in);
281 out = __lsx_vmin_h(max, out);
282 return out;
283 }
284
285 /*
286 * =============================================================================
287 * Description : Set each element of vector between 0 and 255
288 * Arguments : Inputs - _in
289 * Outputs - out
290 * Return Type - halfword
291 * Details : Signed byte elements from _in are clamped between 0 and 255.
292 * Example : out = __lsx_vclip255_h(_in)
293 * _in : -8,255,280,249, -8,255,280,249
294 * out : 0,255,255,249, 0,255,255,249
295 * =============================================================================
296 */
__lsx_vclip255_h(__m128i _in)297 static inline __m128i __lsx_vclip255_h(__m128i _in) {
298 __m128i out;
299
300 out = __lsx_vmaxi_h(_in, 0);
301 out = __lsx_vsat_hu(out, 7);
302 return out;
303 }
304
305 /*
306 * =============================================================================
307 * Description : Set each element of vector between 0 and 255
308 * Arguments : Inputs - _in
309 * Outputs - out
310 * Return Type - word
311 * Details : Signed byte elements from _in are clamped between 0 and 255.
312 * Example : out = __lsx_vclip255_w(_in)
313 * _in : -8,255,280,249
314 * out : 0,255,255,249
315 * =============================================================================
316 */
__lsx_vclip255_w(__m128i _in)317 static inline __m128i __lsx_vclip255_w(__m128i _in) {
318 __m128i out;
319
320 out = __lsx_vmaxi_w(_in, 0);
321 out = __lsx_vsat_wu(out, 7);
322 return out;
323 }
324
/*
 * =============================================================================
 * Description : Swap two variables
 * Arguments   : Inputs  - _in0, _in1
 *               Outputs - _in0, _in1 (in-place)
 * Details     : Exchanges the two vectors through a temporary. Unlike an
 *               xor-based swap this also leaves the value intact when both
 *               arguments name the same object (an xor swap would zero it).
 *               Both arguments must be assignable and are evaluated more than
 *               once.
 * Example     : LSX_SWAP(_in0, _in1)
 *         _in0 : 1,2,3,4
 *         _in1 : 5,6,7,8
 *   _in0(out) : 5,6,7,8
 *   _in1(out) : 1,2,3,4
 * =============================================================================
 */
#define LSX_SWAP(_in0, _in1)      \
  {                               \
    __m128i _lsx_swap_tmp = _in0; \
    _in0 = _in1;                  \
    _in1 = _lsx_swap_tmp;         \
  }
344
/*
 * =============================================================================
 * Description : Transpose 4x4 block with word elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3
 *               Outputs - _out0, _out1, _out2, _out3
 * Details     : Rows become columns. All inputs are read into temporaries
 *               before any output is written, so outputs may name the same
 *               variables as inputs.
 * Example     :
 *               1, 2, 3, 4            1, 5, 9,13
 *               5, 6, 7, 8    to      2, 6,10,14
 *               9,10,11,12  =====>    3, 7,11,15
 *              13,14,15,16            4, 8,12,16
 * =============================================================================
 */
#define LSX_TRANSPOSE4x4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                            \
    __m128i _t0, _t1, _t2, _t3;                                                \
                                                                               \
    _t0 = __lsx_vilvl_w(_in1, _in0);                                           \
    _t1 = __lsx_vilvh_w(_in1, _in0);                                           \
    _t2 = __lsx_vilvl_w(_in3, _in2);                                           \
    _t3 = __lsx_vilvh_w(_in3, _in2);                                           \
    _out0 = __lsx_vilvl_d(_t2, _t0);                                           \
    _out1 = __lsx_vilvh_d(_t2, _t0);                                           \
    _out2 = __lsx_vilvl_d(_t3, _t1);                                           \
    _out3 = __lsx_vilvh_d(_t3, _t1);                                           \
  }
371
/*
 * =============================================================================
 * Description : Transpose 8x8 block with byte elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *               _out7
 * Details     : The rows of the matrix become columns, and the columns
 *               become rows. Only the low 8 bytes of each input are used.
 *               Internal temporaries are all underscore-prefixed (_zero,
 *               _shuf8, _t0.._t7); do not pass variables with those names.
 * Example     : LSX_TRANSPOSE8x8_B
 *        _in0 : 00,01,02,03,04,05,06,07, 00,00,00,00,00,00,00,00
 *        _in1 : 10,11,12,13,14,15,16,17, 00,00,00,00,00,00,00,00
 *        _in2 : 20,21,22,23,24,25,26,27, 00,00,00,00,00,00,00,00
 *        _in3 : 30,31,32,33,34,35,36,37, 00,00,00,00,00,00,00,00
 *        _in4 : 40,41,42,43,44,45,46,47, 00,00,00,00,00,00,00,00
 *        _in5 : 50,51,52,53,54,55,56,57, 00,00,00,00,00,00,00,00
 *        _in6 : 60,61,62,63,64,65,66,67, 00,00,00,00,00,00,00,00
 *        _in7 : 70,71,72,73,74,75,76,77, 00,00,00,00,00,00,00,00
 *
 *       _out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00
 *       _out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00
 *       _out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00
 *       _out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00
 *       _out4 : 04,14,24,34,44,54,64,74, 00,00,00,00,00,00,00,00
 *       _out5 : 05,15,25,35,45,55,65,75, 00,00,00,00,00,00,00,00
 *       _out6 : 06,16,26,36,46,56,66,76, 00,00,00,00,00,00,00,00
 *       _out7 : 07,17,27,37,47,57,67,77, 00,00,00,00,00,00,00,00
 * =============================================================================
 */
#define LSX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7)                                           \
  {                                                                         \
    __m128i _zero = { 0 };                                                  \
    __m128i _shuf8 = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };            \
    __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                         \
                                                                            \
    _t0 = __lsx_vilvl_b(_in2, _in0);                                        \
    _t1 = __lsx_vilvl_b(_in3, _in1);                                        \
    _t2 = __lsx_vilvl_b(_in6, _in4);                                        \
    _t3 = __lsx_vilvl_b(_in7, _in5);                                        \
    _t4 = __lsx_vilvl_b(_t1, _t0);                                          \
    _t5 = __lsx_vilvh_b(_t1, _t0);                                          \
    _t6 = __lsx_vilvl_b(_t3, _t2);                                          \
    _t7 = __lsx_vilvh_b(_t3, _t2);                                          \
    _out0 = __lsx_vilvl_w(_t6, _t4);                                        \
    _out2 = __lsx_vilvh_w(_t6, _t4);                                        \
    _out4 = __lsx_vilvl_w(_t7, _t5);                                        \
    _out6 = __lsx_vilvh_w(_t7, _t5);                                        \
    /* Odd rows: high half of the even row moved down, zero-filled above. */ \
    _out1 = __lsx_vshuf_b(_zero, _out0, _shuf8);                            \
    _out3 = __lsx_vshuf_b(_zero, _out2, _shuf8);                            \
    _out5 = __lsx_vshuf_b(_zero, _out4, _shuf8);                            \
    _out7 = __lsx_vshuf_b(_zero, _out6, _shuf8);                            \
  }
425
/*
 * =============================================================================
 * Description : Transpose 8x8 block with half-word elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *               _out7
 * Details     : Rows become columns. All inputs are read into temporaries
 *               before any output is written, so outputs may name the same
 *               variables as inputs.
 * Example     :
 *              00,01,02,03,04,05,06,07           00,10,20,30,40,50,60,70
 *              10,11,12,13,14,15,16,17           01,11,21,31,41,51,61,71
 *              20,21,22,23,24,25,26,27           02,12,22,32,42,52,62,72
 *              30,31,32,33,34,35,36,37    to     03,13,23,33,43,53,63,73
 *              40,41,42,43,44,45,46,47  ======>  04,14,24,34,44,54,64,74
 *              50,51,52,53,54,55,56,57           05,15,25,35,45,55,65,75
 *              60,61,62,63,64,65,66,67           06,16,26,36,46,56,66,76
 *              70,71,72,73,74,75,76,77           07,17,27,37,47,57,67,77
 * =============================================================================
 */
#define LSX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7)                                           \
  {                                                                         \
    __m128i _s0, _s1, _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;               \
                                                                            \
    _s0 = __lsx_vilvl_h(_in6, _in4);                                        \
    _s1 = __lsx_vilvl_h(_in7, _in5);                                        \
    _t0 = __lsx_vilvl_h(_s1, _s0);                                          \
    _t1 = __lsx_vilvh_h(_s1, _s0);                                          \
    _s0 = __lsx_vilvh_h(_in6, _in4);                                        \
    _s1 = __lsx_vilvh_h(_in7, _in5);                                        \
    _t2 = __lsx_vilvl_h(_s1, _s0);                                          \
    _t3 = __lsx_vilvh_h(_s1, _s0);                                          \
    _s0 = __lsx_vilvl_h(_in2, _in0);                                        \
    _s1 = __lsx_vilvl_h(_in3, _in1);                                        \
    _t4 = __lsx_vilvl_h(_s1, _s0);                                          \
    _t5 = __lsx_vilvh_h(_s1, _s0);                                          \
    _s0 = __lsx_vilvh_h(_in2, _in0);                                        \
    _s1 = __lsx_vilvh_h(_in3, _in1);                                        \
    _t6 = __lsx_vilvl_h(_s1, _s0);                                          \
    _t7 = __lsx_vilvh_h(_s1, _s0);                                          \
                                                                            \
    _out0 = __lsx_vpickev_d(_t0, _t4);                                      \
    _out2 = __lsx_vpickev_d(_t1, _t5);                                      \
    _out4 = __lsx_vpickev_d(_t2, _t6);                                      \
    _out6 = __lsx_vpickev_d(_t3, _t7);                                      \
    _out1 = __lsx_vpickod_d(_t0, _t4);                                      \
    _out3 = __lsx_vpickod_d(_t1, _t5);                                      \
    _out5 = __lsx_vpickod_d(_t2, _t6);                                      \
    _out7 = __lsx_vpickod_d(_t3, _t7);                                      \
  }
475
/*
 * =============================================================================
 * Description : Transpose input 8x4 byte block into 4x8
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *                         (input 8x4 byte block: 8 rows, 4 bytes per row)
 *               Outputs - _out0, _out1, _out2, _out3 (output 4x8 byte block)
 * Details     : The rows of the matrix become columns, and the columns become
 *               rows.
 * Example     : LSX_TRANSPOSE8x4_B
 *        _in0 : 00,01,02,03,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in1 : 10,11,12,13,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in2 : 20,21,22,23,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in3 : 30,31,32,33,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in4 : 40,41,42,43,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in5 : 50,51,52,53,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in6 : 60,61,62,63,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in7 : 70,71,72,73,00,00,00,00, 00,00,00,00,00,00,00,00
 *
 *       _out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00
 *       _out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00
 *       _out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00
 *       _out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00
 * =============================================================================
 */
#define LSX_TRANSPOSE8x4_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                           _out0, _out1, _out2, _out3)                     \
  {                                                                        \
    __m128i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                            \
                                                                           \
    _tmp0_m = __lsx_vpackev_w(_in4, _in0);                                 \
    _tmp1_m = __lsx_vpackev_w(_in5, _in1);                                 \
    _tmp2_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m);                             \
    _tmp0_m = __lsx_vpackev_w(_in6, _in2);                                 \
    _tmp1_m = __lsx_vpackev_w(_in7, _in3);                                 \
                                                                           \
    _tmp3_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m);                             \
    _tmp0_m = __lsx_vilvl_h(_tmp3_m, _tmp2_m);                             \
    _tmp1_m = __lsx_vilvh_h(_tmp3_m, _tmp2_m);                             \
                                                                           \
    _out0 = __lsx_vilvl_w(_tmp1_m, _tmp0_m);                               \
    _out2 = __lsx_vilvh_w(_tmp1_m, _tmp0_m);                               \
    _out1 = __lsx_vilvh_d(_out2, _out0);                                   \
    _out3 = __lsx_vilvh_d(_out0, _out2);                                   \
  }
520
/*
 * =============================================================================
 * Description : Transpose 16x8 block with byte elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
 *                         _in8, _in9, _in10, _in11, _in12, _in13, _in14,
 *                         _in15
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7
 * Details     : 16 rows of 8 bytes become 8 rows of 16 bytes. All inputs are
 *               read into temporaries before any output is written. Relies on
 *               DUP2_ARG2 and DUP4_ARG2 from this header.
 * Example     :
 *              000,001,002,003,004,005,006,007
 *              008,009,010,011,012,013,014,015
 *              016,017,018,019,020,021,022,023
 *              024,025,026,027,028,029,030,031
 *              032,033,034,035,036,037,038,039
 *              040,041,042,043,044,045,046,047        000,008,...,112,120
 *              048,049,050,051,052,053,054,055        001,009,...,113,121
 *              056,057,058,059,060,061,062,063   to   002,010,...,114,122
 *              064,065,066,067,068,069,070,071 =====> 003,011,...,115,123
 *              072,073,074,075,076,077,078,079        004,012,...,116,124
 *              080,081,082,083,084,085,086,087        005,013,...,117,125
 *              088,089,090,091,092,093,094,095        006,014,...,118,126
 *              096,097,098,099,100,101,102,103        007,015,...,119,127
 *              104,105,106,107,108,109,110,111
 *              112,113,114,115,116,117,118,119
 *              120,121,122,123,124,125,126,127
 * =============================================================================
 */
#define LSX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                            _in8, _in9, _in10, _in11, _in12, _in13, _in14,   \
                            _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
                            _out6, _out7)                                    \
  {                                                                          \
    __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7;          \
    __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                          \
    DUP4_ARG2(__lsx_vilvl_b, _in2, _in0, _in3, _in1, _in6, _in4, _in7, _in5, \
              _tmp0, _tmp1, _tmp2, _tmp3);                                   \
    DUP4_ARG2(__lsx_vilvl_b, _in10, _in8, _in11, _in9, _in14, _in12, _in15,  \
              _in13, _tmp4, _tmp5, _tmp6, _tmp7);                            \
    DUP2_ARG2(__lsx_vilvl_b, _tmp1, _tmp0, _tmp3, _tmp2, _t0, _t2);          \
    DUP2_ARG2(__lsx_vilvh_b, _tmp1, _tmp0, _tmp3, _tmp2, _t1, _t3);          \
    DUP2_ARG2(__lsx_vilvl_b, _tmp5, _tmp4, _tmp7, _tmp6, _t4, _t6);          \
    DUP2_ARG2(__lsx_vilvh_b, _tmp5, _tmp4, _tmp7, _tmp6, _t5, _t7);          \
    DUP2_ARG2(__lsx_vilvl_w, _t2, _t0, _t3, _t1, _tmp0, _tmp4);              \
    DUP2_ARG2(__lsx_vilvh_w, _t2, _t0, _t3, _t1, _tmp2, _tmp6);              \
    DUP2_ARG2(__lsx_vilvl_w, _t6, _t4, _t7, _t5, _tmp1, _tmp5);              \
    DUP2_ARG2(__lsx_vilvh_w, _t6, _t4, _t7, _t5, _tmp3, _tmp7);              \
    DUP2_ARG2(__lsx_vilvl_d, _tmp1, _tmp0, _tmp3, _tmp2, _out0, _out2);      \
    DUP2_ARG2(__lsx_vilvh_d, _tmp1, _tmp0, _tmp3, _tmp2, _out1, _out3);      \
    DUP2_ARG2(__lsx_vilvl_d, _tmp5, _tmp4, _tmp7, _tmp6, _out4, _out6);      \
    DUP2_ARG2(__lsx_vilvh_d, _tmp5, _tmp4, _tmp7, _tmp6, _out5, _out7);      \
  }
571
/*
 * =============================================================================
 * Description : Butterfly of 4 input vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3
 *               Outputs - _out0, _out1, _out2, _out3
 * Details     : Butterfly operation. The _B/_H/_W/_D variants use byte,
 *               half-word, word and double-word add/sub respectively.
 *               The four statements run in order with no temporaries, so an
 *               output that names an input still needed by a later statement
 *               (e.g. _out0 == _in0) yields a wrong result.
 * Example     :
 *               out0 = in0 + in3;
 *               out1 = in1 + in2;
 *               out2 = in1 - in2;
 *               out3 = in0 - in3;
 * =============================================================================
 */
#define LSX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                           \
    _out0 = __lsx_vadd_b(_in0, _in3);                                         \
    _out1 = __lsx_vadd_b(_in1, _in2);                                         \
    _out2 = __lsx_vsub_b(_in1, _in2);                                         \
    _out3 = __lsx_vsub_b(_in0, _in3);                                         \
  }
/*
 * Half-word butterfly of 4 vectors:
 *   _out0 = _in0 + _in3;  _out1 = _in1 + _in2;
 *   _out2 = _in1 - _in2;  _out3 = _in0 - _in3;
 * Statements run in order with no temporaries: an output must not name an
 * input that a later statement still reads.
 */
#define LSX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                           \
    _out0 = __lsx_vadd_h(_in0, _in3);                                         \
    _out1 = __lsx_vadd_h(_in1, _in2);                                         \
    _out2 = __lsx_vsub_h(_in1, _in2);                                         \
    _out3 = __lsx_vsub_h(_in0, _in3);                                         \
  }
/*
 * Word butterfly of 4 vectors:
 *   _out0 = _in0 + _in3;  _out1 = _in1 + _in2;
 *   _out2 = _in1 - _in2;  _out3 = _in0 - _in3;
 * Statements run in order with no temporaries: an output must not name an
 * input that a later statement still reads.
 */
#define LSX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                           \
    _out0 = __lsx_vadd_w(_in0, _in3);                                         \
    _out1 = __lsx_vadd_w(_in1, _in2);                                         \
    _out2 = __lsx_vsub_w(_in1, _in2);                                         \
    _out3 = __lsx_vsub_w(_in0, _in3);                                         \
  }
/*
 * Double-word butterfly of 4 vectors:
 *   _out0 = _in0 + _in3;  _out1 = _in1 + _in2;
 *   _out2 = _in1 - _in2;  _out3 = _in0 - _in3;
 * Statements run in order with no temporaries: an output must not name an
 * input that a later statement still reads.
 */
#define LSX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                           \
    _out0 = __lsx_vadd_d(_in0, _in3);                                         \
    _out1 = __lsx_vadd_d(_in1, _in2);                                         \
    _out2 = __lsx_vsub_d(_in1, _in2);                                         \
    _out3 = __lsx_vsub_d(_in0, _in3);                                         \
  }
613
/*
 * =============================================================================
 * Description : Butterfly of 8 input vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7
 * Details     : Butterfly operation. The _B/_H/_W/_D variants use byte,
 *               half-word, word and double-word add/sub respectively.
 *               The eight statements run in order with no temporaries, so an
 *               output that names an input still needed by a later statement
 *               (e.g. _out0 == _in0) yields a wrong result.
 * Example     :
 *              _out0 = _in0 + _in7;
 *              _out1 = _in1 + _in6;
 *              _out2 = _in2 + _in5;
 *              _out3 = _in3 + _in4;
 *              _out4 = _in3 - _in4;
 *              _out5 = _in2 - _in5;
 *              _out6 = _in1 - _in6;
 *              _out7 = _in0 - _in7;
 * =============================================================================
 */
#define LSX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                          _out7)                                           \
  {                                                                        \
    _out0 = __lsx_vadd_b(_in0, _in7);                                      \
    _out1 = __lsx_vadd_b(_in1, _in6);                                      \
    _out2 = __lsx_vadd_b(_in2, _in5);                                      \
    _out3 = __lsx_vadd_b(_in3, _in4);                                      \
    _out4 = __lsx_vsub_b(_in3, _in4);                                      \
    _out5 = __lsx_vsub_b(_in2, _in5);                                      \
    _out6 = __lsx_vsub_b(_in1, _in6);                                      \
    _out7 = __lsx_vsub_b(_in0, _in7);                                      \
  }
644
/*
 * Half-word butterfly of 8 vectors:
 *   _outK     = _inK + _in(7-K)   for K = 0..3
 *   _out(7-K) = _inK - _in(7-K)   for K = 0..3
 * Statements run in order with no temporaries: an output must not name an
 * input that a later statement still reads.
 */
#define LSX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                          _out7)                                           \
  {                                                                        \
    _out0 = __lsx_vadd_h(_in0, _in7);                                      \
    _out1 = __lsx_vadd_h(_in1, _in6);                                      \
    _out2 = __lsx_vadd_h(_in2, _in5);                                      \
    _out3 = __lsx_vadd_h(_in3, _in4);                                      \
    _out4 = __lsx_vsub_h(_in3, _in4);                                      \
    _out5 = __lsx_vsub_h(_in2, _in5);                                      \
    _out6 = __lsx_vsub_h(_in1, _in6);                                      \
    _out7 = __lsx_vsub_h(_in0, _in7);                                      \
  }
658
/*
 * Word butterfly of 8 vectors:
 *   _outK     = _inK + _in(7-K)   for K = 0..3
 *   _out(7-K) = _inK - _in(7-K)   for K = 0..3
 * Statements run in order with no temporaries: an output must not name an
 * input that a later statement still reads.
 */
#define LSX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                          _out7)                                           \
  {                                                                        \
    _out0 = __lsx_vadd_w(_in0, _in7);                                      \
    _out1 = __lsx_vadd_w(_in1, _in6);                                      \
    _out2 = __lsx_vadd_w(_in2, _in5);                                      \
    _out3 = __lsx_vadd_w(_in3, _in4);                                      \
    _out4 = __lsx_vsub_w(_in3, _in4);                                      \
    _out5 = __lsx_vsub_w(_in2, _in5);                                      \
    _out6 = __lsx_vsub_w(_in1, _in6);                                      \
    _out7 = __lsx_vsub_w(_in0, _in7);                                      \
  }
672
/*
 * Double-word butterfly of 8 vectors:
 *   _outK     = _inK + _in(7-K)   for K = 0..3
 *   _out(7-K) = _inK - _in(7-K)   for K = 0..3
 * Statements run in order with no temporaries: an output must not name an
 * input that a later statement still reads.
 */
#define LSX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                          _out7)                                           \
  {                                                                        \
    _out0 = __lsx_vadd_d(_in0, _in7);                                      \
    _out1 = __lsx_vadd_d(_in1, _in6);                                      \
    _out2 = __lsx_vadd_d(_in2, _in5);                                      \
    _out3 = __lsx_vadd_d(_in3, _in4);                                      \
    _out4 = __lsx_vsub_d(_in3, _in4);                                      \
    _out5 = __lsx_vsub_d(_in2, _in5);                                      \
    _out6 = __lsx_vsub_d(_in1, _in6);                                      \
    _out7 = __lsx_vsub_d(_in0, _in7);                                      \
  }
686
687 #endif // LSX
688
689 #ifdef __loongarch_asx
690 #include <lasxintrin.h>
691 /*
692 * =============================================================================
693 * Description : Dot product of byte vector elements
694 * Arguments : Inputs - in_h, in_l
695 * Output - out
696 * Return Type - signed halfword
697 * Details : Unsigned byte elements from in_h are multiplied with
698 * unsigned byte elements from in_l producing a result
699 * twice the size of input i.e. signed halfword.
700 * Then this multiplied results of adjacent odd-even elements
701 * are added to the out vector
702 * Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
703 * =============================================================================
704 */
__lasx_xvdp2_h_bu(__m256i in_h,__m256i in_l)705 static inline __m256i __lasx_xvdp2_h_bu(__m256i in_h, __m256i in_l) {
706 __m256i out;
707
708 out = __lasx_xvmulwev_h_bu(in_h, in_l);
709 out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
710 return out;
711 }
712
713 /*
714 * =============================================================================
715 * Description : Dot product of byte vector elements
716 * Arguments : Inputs - in_h, in_l
717 * Output - out
718 * Return Type - signed halfword
719 * Details : Signed byte elements from in_h are multiplied with
720 * signed byte elements from in_l producing a result
721 * twice the size of input i.e. signed halfword.
722 * Then this multiplication results of adjacent odd-even elements
723 * are added to the out vector
724 * Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
725 * =============================================================================
726 */
__lasx_xvdp2_h_b(__m256i in_h,__m256i in_l)727 static inline __m256i __lasx_xvdp2_h_b(__m256i in_h, __m256i in_l) {
728 __m256i out;
729
730 out = __lasx_xvmulwev_h_b(in_h, in_l);
731 out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
732 return out;
733 }
734
735 /*
736 * =============================================================================
737 * Description : Dot product of halfword vector elements
738 * Arguments : Inputs - in_h, in_l
739 * Output - out
740 * Return Type - signed word
741 * Details : Signed halfword elements from in_h are multiplied with
742 * signed halfword elements from in_l producing a result
743 * twice the size of input i.e. signed word.
744 * Then this multiplied results of adjacent odd-even elements
745 * are added to the out vector.
746 * Example : out = __lasx_xvdp2_w_h(in_h, in_l)
747 * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
748 * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
749 * out : 22,38,38,22, 22,38,38,22
750 * =============================================================================
751 */
__lasx_xvdp2_w_h(__m256i in_h,__m256i in_l)752 static inline __m256i __lasx_xvdp2_w_h(__m256i in_h, __m256i in_l) {
753 __m256i out;
754
755 out = __lasx_xvmulwev_w_h(in_h, in_l);
756 out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
757 return out;
758 }
759
760 /*
761 * =============================================================================
762 * Description : Dot product of word vector elements
763 * Arguments : Inputs - in_h, in_l
764 * Output - out
765 * Return Type - signed double
766 * Details : Signed word elements from in_h are multiplied with
767 * signed word elements from in_l producing a result
768 * twice the size of input i.e. signed double-word.
769 * Then this multiplied results of adjacent odd-even elements
770 * are added to the out vector.
771 * Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
772 * =============================================================================
773 */
__lasx_xvdp2_d_w(__m256i in_h,__m256i in_l)774 static inline __m256i __lasx_xvdp2_d_w(__m256i in_h, __m256i in_l) {
775 __m256i out;
776
777 out = __lasx_xvmulwev_d_w(in_h, in_l);
778 out = __lasx_xvmaddwod_d_w(out, in_h, in_l);
779 return out;
780 }
781
782 /*
783 * =============================================================================
784 * Description : Dot product of halfword vector elements
785 * Arguments : Inputs - in_h, in_l
786 * Output - out
787 * Return Type - signed word
788 * Details : Unsigned halfword elements from in_h are multiplied with
789 * signed halfword elements from in_l producing a result
790 * twice the size of input i.e. unsigned word.
791 * Multiplication result of adjacent odd-even elements
792 * are added to the out vector
793 * Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
794 * =============================================================================
795 */
static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l) {
  /* in_h is read as unsigned halfwords, in_l as signed halfwords. */
  __m256i even_prod = __lasx_xvmulwev_w_hu_h(in_h, in_l);

  /* Add the odd-indexed mixed-sign products to complete each pair sum. */
  return __lasx_xvmaddwod_w_hu_h(even_prod, in_h, in_l);
}
803
804 /*
805 * =============================================================================
806 * Description : Dot product & addition of byte vector elements
 * Arguments : Inputs - in_c, in_h, in_l
808 * Output - out
809 * Return Type - halfword
810 * Details : Signed byte elements from in_h are multiplied with
811 * signed byte elements from in_l producing a result
812 * twice the size of input i.e. signed halfword.
813 * Then this multiplied results of adjacent odd-even elements
814 * are added to the in_c vector.
815 * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
816 * =============================================================================
817 */
static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c, __m256i in_h,
                                          __m256i in_l) {
  /* in_c plus the widened products of the even-indexed signed bytes. */
  __m256i acc_even = __lasx_xvmaddwev_h_b(in_c, in_h, in_l);

  /* Then the odd-indexed products are accumulated on the same halfwords. */
  return __lasx_xvmaddwod_h_b(acc_even, in_h, in_l);
}
826
827 /*
828 * =============================================================================
829 * Description : Dot product & addition of byte vector elements
 * Arguments : Inputs - in_c, in_h, in_l
831 * Output - out
832 * Return Type - halfword
833 * Details : Unsigned byte elements from in_h are multiplied with
834 * unsigned byte elements from in_l producing a result
835 * twice the size of input i.e. signed halfword.
836 * Then this multiplied results of adjacent odd-even elements
837 * are added to the in_c vector.
838 * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
839 * =============================================================================
840 */
static inline __m256i __lasx_xvdp2add_h_bu(__m256i in_c, __m256i in_h,
                                           __m256i in_l) {
  /* in_c plus the widened products of the even-indexed unsigned bytes. */
  __m256i acc_even = __lasx_xvmaddwev_h_bu(in_c, in_h, in_l);

  /* Odd-indexed unsigned products are accumulated on top. */
  return __lasx_xvmaddwod_h_bu(acc_even, in_h, in_l);
}
849
850 /*
851 * =============================================================================
852 * Description : Dot product & addition of byte vector elements
 * Arguments : Inputs - in_c, in_h, in_l
854 * Output - out
855 * Return Type - halfword
856 * Details : Unsigned byte elements from in_h are multiplied with
857 * signed byte elements from in_l producing a result
858 * twice the size of input i.e. signed halfword.
859 * Then this multiplied results of adjacent odd-even elements
860 * are added to the in_c vector.
861 * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
862 * =============================================================================
863 */
static inline __m256i __lasx_xvdp2add_h_bu_b(__m256i in_c, __m256i in_h,
                                             __m256i in_l) {
  /* Mixed signedness: unsigned bytes of in_h times signed bytes of in_l. */
  __m256i acc_even = __lasx_xvmaddwev_h_bu_b(in_c, in_h, in_l);

  /* Finish each pair by accumulating the odd-indexed products. */
  return __lasx_xvmaddwod_h_bu_b(acc_even, in_h, in_l);
}
872
873 /*
874 * =============================================================================
875 * Description : Dot product of halfword vector elements
876 * Arguments : Inputs - in_c, in_h, in_l
877 * Output - out
 * Return Type - signed word
879 * Details : Signed halfword elements from in_h are multiplied with
880 * signed halfword elements from in_l producing a result
881 * twice the size of input i.e. signed word.
882 * Multiplication result of adjacent odd-even elements
883 * are added to the in_c vector.
884 * Example : out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
885 * in_c : 1,2,3,4, 1,2,3,4
886 * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8,
887 * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1,
888 * out : 23,40,41,26, 23,40,41,26
889 * =============================================================================
890 */
static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c, __m256i in_h,
                                          __m256i in_l) {
  /* in_c plus the widened products of the even-indexed signed halfwords. */
  __m256i acc_even = __lasx_xvmaddwev_w_h(in_c, in_h, in_l);

  /* Odd-indexed products land on the same word accumulators. */
  return __lasx_xvmaddwod_w_h(acc_even, in_h, in_l);
}
899
900 /*
901 * =============================================================================
902 * Description : Dot product of halfword vector elements
903 * Arguments : Inputs - in_c, in_h, in_l
904 * Output - out
905 * Return Type - signed word
906 * Details : Unsigned halfword elements from in_h are multiplied with
907 * unsigned halfword elements from in_l producing a result
908 * twice the size of input i.e. signed word.
909 * Multiplication result of adjacent odd-even elements
910 * are added to the in_c vector.
911 * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
912 * =============================================================================
913 */
static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c, __m256i in_h,
                                           __m256i in_l) {
  /* in_c plus the widened products of the even-indexed unsigned halfwords. */
  __m256i acc_even = __lasx_xvmaddwev_w_hu(in_c, in_h, in_l);

  /* Odd-indexed unsigned products are accumulated on top. */
  return __lasx_xvmaddwod_w_hu(acc_even, in_h, in_l);
}
922
923 /*
924 * =============================================================================
925 * Description : Dot product of halfword vector elements
926 * Arguments : Inputs - in_c, in_h, in_l
927 * Output - out
928 * Return Type - signed word
929 * Details : Unsigned halfword elements from in_h are multiplied with
930 * signed halfword elements from in_l producing a result
931 * twice the size of input i.e. signed word.
932 * Multiplication result of adjacent odd-even elements
933 * are added to the in_c vector
934 * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
935 * =============================================================================
936 */
static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c, __m256i in_h,
                                             __m256i in_l) {
  /* Mixed signedness: unsigned halfwords of in_h, signed halfwords of in_l. */
  __m256i acc_even = __lasx_xvmaddwev_w_hu_h(in_c, in_h, in_l);

  /* Finish each pair by accumulating the odd-indexed products. */
  return __lasx_xvmaddwod_w_hu_h(acc_even, in_h, in_l);
}
945
946 /*
947 * =============================================================================
948 * Description : Vector Unsigned Dot Product and Subtract
949 * Arguments : Inputs - in_c, in_h, in_l
950 * Output - out
951 * Return Type - signed halfword
952 * Details : Unsigned byte elements from in_h are multiplied with
953 * unsigned byte elements from in_l producing a result
954 * twice the size of input i.e. signed halfword.
955 * Multiplication result of adjacent odd-even elements
956 * are added together and subtracted from double width elements
957 * in_c vector.
958 * Example : See out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l)
959 * =============================================================================
960 */
static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c, __m256i in_h,
                                           __m256i in_l) {
  /* Build the pairwise dot product first: even products, then odd ones. */
  __m256i dot = __lasx_xvmulwev_h_bu(in_h, in_l);

  dot = __lasx_xvmaddwod_h_bu(dot, in_h, in_l);

  /* The dot product is the subtrahend; in_c is the minuend. */
  return __lasx_xvsub_h(in_c, dot);
}
970
971 /*
972 * =============================================================================
973 * Description : Vector Signed Dot Product and Subtract
974 * Arguments : Inputs - in_c, in_h, in_l
975 * Output - out
976 * Return Type - signed word
977 * Details : Signed halfword elements from in_h are multiplied with
978 * Signed halfword elements from in_l producing a result
979 * twice the size of input i.e. signed word.
980 * Multiplication result of adjacent odd-even elements
981 * are added together and subtracted from double width elements
982 * in_c vector.
983 * Example : out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l)
984 * in_c : 0,0,0,0, 0,0,0,0
985 * in_h : 3,1,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1
986 * in_l : 2,1,1,0, 1,0,0,0, 0,0,1,0, 1,0,0,1
987 * out : -7,-3,0,0, 0,-1,0,-1
988 * =============================================================================
989 */
static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h,
                                          __m256i in_l) {
  /* Pairwise signed dot product: even-indexed products plus odd-indexed. */
  __m256i dot = __lasx_xvmulwev_w_h(in_h, in_l);

  dot = __lasx_xvmaddwod_w_h(dot, in_h, in_l);

  /* Subtract the dot product from the word accumulators in in_c. */
  return __lasx_xvsub_w(in_c, dot);
}
999
1000 /*
1001 * =============================================================================
1002 * Description : Dot product of halfword vector elements
1003 * Arguments : Inputs - in_h, in_l
1004 * Output - out
 * Return Type - signed double-word
1006 * Details : Signed halfword elements from in_h are multiplied with
1007 * signed halfword elements from in_l producing a result
1008 * four times the size of input i.e. signed doubleword.
1009 * Then this multiplication results of four adjacent elements
1010 * are added together and stored to the out vector.
1011 * Example : out = __lasx_xvdp4_d_h(in_h, in_l)
1012 * in_h : 3,1,3,0, 0,0,0,1, 0,0,1,-1, 0,0,0,1
1013 * in_l : -2,1,1,0, 1,0,0,0, 0,0,1, 0, 1,0,0,1
1014 * out : -2,0,1,1
1015 * =============================================================================
1016 */
static inline __m256i __lasx_xvdp4_d_h(__m256i in_h, __m256i in_l) {
  /* Two-element dot products as words: even products, then odd ones. */
  __m256i pair_sum = __lasx_xvmulwev_w_h(in_h, in_l);

  pair_sum = __lasx_xvmaddwod_w_h(pair_sum, in_h, in_l);

  /* Widening horizontal add merges two neighbouring word sums into one
   * double-word, giving a four-element dot product per output element. */
  return __lasx_xvhaddw_d_w(pair_sum, pair_sum);
}
1025
1026 /*
1027 * =============================================================================
1028 * Description : The high half of the vector elements are expanded and
1029 * added after being doubled.
1030 * Arguments : Inputs - in_h, in_l
1031 * Output - out
1032 * Details : The in_h vector and the in_l vector are added after the
1033 * higher half of the two-fold sign extension (signed byte
1034 * to signed halfword) and stored to the out vector.
1035 * Example : See out = __lasx_xvaddwh_w_h(in_h, in_l)
1036 * =============================================================================
1037 */
static inline __m256i __lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l) {
  /* Interleave the high-half bytes so each in_h/in_l pair is adjacent. */
  __m256i paired = __lasx_xvilvh_b(in_h, in_l);

  /* Widening horizontal add (signed) sums every pair into a halfword. */
  return __lasx_xvhaddw_h_b(paired, paired);
}
1045
1046 /*
1047 * =============================================================================
1048 * Description : The high half of the vector elements are expanded and
1049 * added after being doubled.
1050 * Arguments : Inputs - in_h, in_l
1051 * Output - out
1052 * Details : The in_h vector and the in_l vector are added after the
1053 * higher half of the two-fold sign extension (signed halfword
1054 * to signed word) and stored to the out vector.
1055 * Example : out = __lasx_xvaddwh_w_h(in_h, in_l)
1056 * in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1057 * in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1
1058 * out : 1,0,0,-1, 1,0,0, 2
1059 * =============================================================================
1060 */
static inline __m256i __lasx_xvaddwh_w_h(__m256i in_h, __m256i in_l) {
  /* Interleave the high-half halfwords so each in_h/in_l pair is adjacent. */
  __m256i paired = __lasx_xvilvh_h(in_h, in_l);

  /* Widening horizontal add (signed) sums every pair into a word. */
  return __lasx_xvhaddw_w_h(paired, paired);
}
1068
1069 /*
1070 * =============================================================================
1071 * Description : The low half of the vector elements are expanded and
1072 * added after being doubled.
1073 * Arguments : Inputs - in_h, in_l
1074 * Output - out
1075 * Details : The in_h vector and the in_l vector are added after the
1076 * lower half of the two-fold sign extension (signed byte
1077 * to signed halfword) and stored to the out vector.
1078 * Example : See out = __lasx_xvaddwl_w_h(in_h, in_l)
1079 * =============================================================================
1080 */
static inline __m256i __lasx_xvaddwl_h_b(__m256i in_h, __m256i in_l) {
  /* Interleave the low-half bytes so each in_h/in_l pair is adjacent. */
  __m256i paired = __lasx_xvilvl_b(in_h, in_l);

  /* Widening horizontal add (signed) sums every pair into a halfword. */
  return __lasx_xvhaddw_h_b(paired, paired);
}
1088
1089 /*
1090 * =============================================================================
1091 * Description : The low half of the vector elements are expanded and
1092 * added after being doubled.
1093 * Arguments : Inputs - in_h, in_l
1094 * Output - out
1095 * Details : The in_h vector and the in_l vector are added after the
1096 * lower half of the two-fold sign extension (signed halfword
1097 * to signed word) and stored to the out vector.
1098 * Example : out = __lasx_xvaddwl_w_h(in_h, in_l)
1099 * in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1100 * in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1
1101 * out : 5,-1,4,2, 1,0,2,-1
1102 * =============================================================================
1103 */
static inline __m256i __lasx_xvaddwl_w_h(__m256i in_h, __m256i in_l) {
  /* Interleave the low-half halfwords so each in_h/in_l pair is adjacent. */
  __m256i paired = __lasx_xvilvl_h(in_h, in_l);

  /* Widening horizontal add (signed) sums every pair into a word. */
  return __lasx_xvhaddw_w_h(paired, paired);
}
1111
1112 /*
1113 * =============================================================================
1114 * Description : The low half of the vector elements are expanded and
1115 * added after being doubled.
1116 * Arguments : Inputs - in_h, in_l
1117 * Output - out
 * Details : The in_h vector and the in_l vector are added after the
1119 * lower half of the two-fold zero extension (unsigned byte
1120 * to unsigned halfword) and stored to the out vector.
1121 * Example : See out = __lasx_xvaddwl_w_h(in_h, in_l)
1122 * =============================================================================
1123 */
static inline __m256i __lasx_xvaddwl_h_bu(__m256i in_h, __m256i in_l) {
  /* Interleave the low-half bytes so each in_h/in_l pair is adjacent. */
  __m256i paired = __lasx_xvilvl_b(in_h, in_l);

  /* Unsigned widening horizontal add sums every pair into a halfword. */
  return __lasx_xvhaddw_hu_bu(paired, paired);
}
1131
1132 /*
1133 * =============================================================================
1134 * Description : The low half of the vector elements are expanded and
1135 * added after being doubled.
1136 * Arguments : Inputs - in_h, in_l
1137 * Output - out
1138 * Details : The in_l vector after double zero extension (unsigned byte to
 * signed halfword), added to the in_h vector.
1140 * Example : See out = __lasx_xvaddw_w_w_h(in_h, in_l)
1141 * =============================================================================
1142 */
static inline __m256i __lasx_xvaddw_h_h_bu(__m256i in_h, __m256i in_l) {
  /* Shift count 0: only the unsigned byte -> halfword widening is wanted. */
  __m256i widened = __lasx_xvsllwil_hu_bu(in_l, 0);

  /* in_h already holds halfwords; add element-wise. */
  return __lasx_xvadd_h(in_h, widened);
}
1150
1151 /*
1152 * =============================================================================
1153 * Description : The low half of the vector elements are expanded and
1154 * added after being doubled.
1155 * Arguments : Inputs - in_h, in_l
1156 * Output - out
1157 * Details : The in_l vector after double sign extension (signed halfword to
1158 * signed word), added to the in_h vector.
1159 * Example : out = __lasx_xvaddw_w_w_h(in_h, in_l)
1160 * in_h : 0, 1,0,0, -1,0,0,1,
1161 * in_l : 2,-1,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1,
1162 * out : 2, 0,1,2, -1,0,1,1,
1163 * =============================================================================
1164 */
static inline __m256i __lasx_xvaddw_w_w_h(__m256i in_h, __m256i in_l) {
  /* Shift count 0: only the signed halfword -> word widening is wanted. */
  __m256i widened = __lasx_xvsllwil_w_h(in_l, 0);

  /* in_h already holds words; add element-wise. */
  return __lasx_xvadd_w(in_h, widened);
}
1172
1173 /*
1174 * =============================================================================
1175 * Description : Multiplication and addition calculation after expansion
1176 * of the lower half of the vector.
1177 * Arguments : Inputs - in_c, in_h, in_l
1178 * Output - out
1179 * Details : The in_h vector and the in_l vector are multiplied after
1180 * the lower half of the two-fold sign extension (signed halfword
1181 * to signed word), and the result is added to the vector in_c,
1182 * then stored to the out vector.
1183 * Example : out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l)
1184 * in_c : 1,2,3,4, 5,6,7,8
1185 * in_h : 1,2,3,4, 1,2,3,4, 5,6,7,8, 5,6,7,8
1186 * in_l : 200, 300, 400, 500, 2000, 3000, 4000, 5000,
1187 * -200,-300,-400,-500, -2000,-3000,-4000,-5000
1188 * out : 201, 602,1203,2004, -995, -1794,-2793,-3992
1189 * =============================================================================
1190 */
static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c, __m256i in_h,
                                          __m256i in_l) {
  /* Widen the low-half halfwords of both operands to words (shift by 0). */
  __m256i wide_h = __lasx_xvsllwil_w_h(in_h, 0);
  __m256i wide_l = __lasx_xvsllwil_w_h(in_l, 0);
  __m256i prod = __lasx_xvmul_w(wide_h, wide_l);

  /* Accumulate the word products onto in_c. */
  return __lasx_xvadd_w(prod, in_c);
}
1201
1202 /*
1203 * =============================================================================
1204 * Description : Multiplication and addition calculation after expansion
1205 * of the higher half of the vector.
1206 * Arguments : Inputs - in_c, in_h, in_l
1207 * Output - out
1208 * Details : The in_h vector and the in_l vector are multiplied after
1209 * the higher half of the two-fold sign extension (signed
1210 * halfword to signed word), and the result is added to
1211 * the vector in_c, then stored to the out vector.
1212 * Example : See out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l)
1213 * =============================================================================
1214 */
static inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c, __m256i in_h,
                                          __m256i in_l) {
  /* Interleaving a vector with itself duplicates each high-half halfword,
   * so the even-element widening multiply below sees every one of them. */
  __m256i dup_h = __lasx_xvilvh_h(in_h, in_h);
  __m256i dup_l = __lasx_xvilvh_h(in_l, in_l);
  __m256i prod = __lasx_xvmulwev_w_h(dup_h, dup_l);

  /* Accumulate the word products onto in_c. */
  return __lasx_xvadd_w(prod, in_c);
}
1225
1226 /*
1227 * =============================================================================
1228 * Description : Multiplication calculation after expansion of the lower
1229 * half of the vector.
1230 * Arguments : Inputs - in_h, in_l
1231 * Output - out
1232 * Details : The in_h vector and the in_l vector are multiplied after
1233 * the lower half of the two-fold sign extension (signed
1234 * halfword to signed word), then stored to the out vector.
1235 * Example : out = __lasx_xvmulwl_w_h(in_h, in_l)
1236 * in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1237 * in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1
1238 * out : 6,1,3,0, 0,0,1,0
1239 * =============================================================================
1240 */
static inline __m256i __lasx_xvmulwl_w_h(__m256i in_h, __m256i in_l) {
  /* Widen the low-half halfwords of both operands to words (shift by 0). */
  __m256i wide_h = __lasx_xvsllwil_w_h(in_h, 0);
  __m256i wide_l = __lasx_xvsllwil_w_h(in_l, 0);

  /* Plain word multiply of the widened operands. */
  return __lasx_xvmul_w(wide_h, wide_l);
}
1249
1250 /*
1251 * =============================================================================
 * Description : Multiplication calculation after expansion of the higher
1253 * half of the vector.
1254 * Arguments : Inputs - in_h, in_l
1255 * Output - out
1256 * Details : The in_h vector and the in_l vector are multiplied after
 * the higher half of the two-fold sign extension (signed
1258 * halfword to signed word), then stored to the out vector.
1259 * Example : out = __lasx_xvmulwh_w_h(in_h, in_l)
1260 * in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1261 * in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1
1262 * out : 0,0,0,0, 0,0,0,1
1263 * =============================================================================
1264 */
static inline __m256i __lasx_xvmulwh_w_h(__m256i in_h, __m256i in_l) {
  /* Interleaving a vector with itself duplicates each high-half halfword,
   * so the even-element widening multiply below sees every one of them. */
  __m256i dup_h = __lasx_xvilvh_h(in_h, in_h);
  __m256i dup_l = __lasx_xvilvh_h(in_l, in_l);

  return __lasx_xvmulwev_w_h(dup_h, dup_l);
}
1273
1274 /*
1275 * =============================================================================
1276 * Description : The low half of the vector elements are added to the high half
1277 * after being doubled, then saturated.
1278 * Arguments : Inputs - in_h, in_l
1279 * Output - out
1280 * Details : The in_h vector adds the in_l vector after the lower half of
1281 * the two-fold zero extension (unsigned byte to unsigned
1282 * halfword) and then saturated. The results are stored to the out
1283 * vector.
1284 * Example : out = __lasx_xvsaddw_hu_hu_bu(in_h, in_l)
1285 * in_h : 2,65532,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1
1286 * in_l : 3,6,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1, 3,18,3,0, 0,0,0,1, 0,0,1,1,
1287 * 0,0,0,1
1288 * out : 5,65535,4,2, 1,0,0,1, 3,18,4,0, 1,0,0,2,
1289 * =============================================================================
1290 */
static inline __m256i __lasx_xvsaddw_hu_hu_bu(__m256i in_h, __m256i in_l) {
  __m256i zero = { 0 };
  /* Interleaving with zero bytes zero-extends the low-half bytes of in_l
   * to halfwords. */
  __m256i widened = __lasx_xvilvl_b(zero, in_l);

  /* Unsigned saturating halfword add. */
  return __lasx_xvsadd_hu(in_h, widened);
}
1299
1300 /*
1301 * =============================================================================
1302 * Description : Clip all halfword elements of input vector between min & max
1303 * out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in))
1304 * Arguments : Inputs - in (input vector)
1305 * - min (min threshold)
1306 * - max (max threshold)
 * Outputs - out (output vector with clipped elements)
1308 * Return Type - signed halfword
1309 * Example : out = __lasx_xvclip_h(in, min, max)
1310 * in : -8,2,280,249, -8,255,280,249, 4,4,4,4, 5,5,5,5
1311 * min : 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1
1312 * max : 9,9,9,9, 9,9,9,9, 9,9,9,9, 9,9,9,9
1313 * out : 1,2,9,9, 1,9,9,9, 4,4,4,4, 5,5,5,5
1314 * =============================================================================
1315 */
static inline __m256i __lasx_xvclip_h(__m256i in, __m256i min, __m256i max) {
  /* Raise every element to at least its lower bound ... */
  __m256i floored = __lasx_xvmax_h(min, in);

  /* ... then cap the result at the upper bound. */
  return __lasx_xvmin_h(max, floored);
}
1323
1324 /*
1325 * =============================================================================
1326 * Description : Clip all signed halfword elements of input vector
1327 * between 0 & 255
1328 * Arguments : Inputs - in (input vector)
1329 * Outputs - out (output vector with clipped elements)
1330 * Return Type - signed halfword
1331 * Example : See out = __lasx_xvclip255_w(in)
1332 * =============================================================================
1333 */
static inline __m256i __lasx_xvclip255_h(__m256i in) {
  /* Negative halfwords become 0. */
  __m256i non_neg = __lasx_xvmaxi_h(in, 0);

  /* Unsigned saturation with immediate 7 provides the upper clip at 255. */
  return __lasx_xvsat_hu(non_neg, 7);
}
1341
1342 /*
1343 * =============================================================================
1344 * Description : Clip all signed word elements of input vector
1345 * between 0 & 255
1346 * Arguments : Inputs - in (input vector)
1347 * Output - out (output vector with clipped elements)
1348 * Return Type - signed word
1349 * Example : out = __lasx_xvclip255_w(in)
1350 * in : -8,255,280,249, -8,255,280,249
1351 * out : 0,255,255,249, 0,255,255,249
1352 * =============================================================================
1353 */
static inline __m256i __lasx_xvclip255_w(__m256i in) {
  /* Negative words become 0. */
  __m256i non_neg = __lasx_xvmaxi_w(in, 0);

  /* Unsigned saturation with immediate 7 provides the upper clip at 255. */
  return __lasx_xvsat_wu(non_neg, 7);
}
1361
1362 /*
1363 * =============================================================================
1364 * Description : Indexed halfword element values are replicated to all
1365 * elements in output vector. If 'idx < 8' use xvsplati_l_*,
1366 * if 'idx >= 8' use xvsplati_h_*.
1367 * Arguments : Inputs - in, idx
1368 * Output - out
1369 * Details : Idx element value from in vector is replicated to all
1370 * elements in out vector.
1371 * Valid index range for halfword operation is 0-7
1372 * Example : out = __lasx_xvsplati_l_h(in, idx)
1373 * in : 20,10,11,12, 13,14,15,16, 0,0,2,0, 0,0,0,0
1374 * idx : 0x02
1375 * out : 11,11,11,11, 11,11,11,11, 11,11,11,11, 11,11,11,11
1376 * =============================================================================
1377 */
static inline __m256i __lasx_xvsplati_l_h(__m256i in, int idx) {
  /* Selector 0x02 with `in` as both sources: going by the header example,
   * both 128-bit halves end up holding the low half of `in`. */
  __m256i halves = __lasx_xvpermi_q(in, in, 0x02);

  /* Broadcast the halfword selected by idx across the vector. */
  return __lasx_xvreplve_h(halves, idx);
}
1385
1386 /*
1387 * =============================================================================
1388 * Description : Indexed halfword element values are replicated to all
1389 * elements in output vector. If 'idx < 8' use xvsplati_l_*,
1390 * if 'idx >= 8' use xvsplati_h_*.
1391 * Arguments : Inputs - in, idx
1392 * Output - out
1393 * Details : Idx element value from in vector is replicated to all
1394 * elements in out vector.
 * Valid index range for halfword operation is 8-15
1396 * Example : out = __lasx_xvsplati_h_h(in, idx)
1397 * in : 20,10,11,12, 13,14,15,16, 0,2,0,0, 0,0,0,0
1398 * idx : 0x09
1399 * out : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2
1400 * =============================================================================
1401 */
static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx) {
  /* Selector 0x13 with `in` as both sources: going by the header example,
   * both 128-bit halves end up holding the high half of `in`. */
  __m256i halves = __lasx_xvpermi_q(in, in, 0x13);

  /* Broadcast the halfword selected by idx across the vector. */
  return __lasx_xvreplve_h(halves, idx);
}
1409
1410 /*
1411 * =============================================================================
1412 * Description : Transpose 4x4 block with double-word elements in vectors
1413 * Arguments : Inputs - _in0, _in1, _in2, _in3
1414 * Outputs - _out0, _out1, _out2, _out3
1415 * Example : LASX_TRANSPOSE4x4_D
1416 * _in0 : 1,2,3,4
1417 * _in1 : 1,2,3,4
1418 * _in2 : 1,2,3,4
1419 * _in3 : 1,2,3,4
1420 *
1421 * _out0 : 1,1,1,1
1422 * _out1 : 2,2,2,2
1423 * _out2 : 3,3,3,3
1424 * _out3 : 4,4,4,4
1425 * =============================================================================
1426 */
#define LASX_TRANSPOSE4x4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \
                            _out3)                                       \
  {                                                                      \
    /* Interleave the double-words of row pairs (0,1) and (2,3). */      \
    __m256i _tmp0 = __lasx_xvilvl_d(_in1, _in0);                         \
    __m256i _tmp1 = __lasx_xvilvh_d(_in1, _in0);                         \
    __m256i _tmp2 = __lasx_xvilvl_d(_in3, _in2);                         \
    __m256i _tmp3 = __lasx_xvilvh_d(_in3, _in2);                         \
    /* Combine 128-bit halves: 0x20 -> _out0/_out1, 0x31 -> _out2/3. */  \
    _out0 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x20);                        \
    _out2 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x31);                        \
    _out1 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x20);                        \
    _out3 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x31);                        \
  }
1440
1441 /*
1442 * =============================================================================
1443 * Description : Transpose 8x8 block with word elements in vectors
1444 * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
1445 * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
1446 * _out7
1447 * Example : LASX_TRANSPOSE8x8_W
1448 * _in0 : 1,2,3,4,5,6,7,8
1449 * _in1 : 2,2,3,4,5,6,7,8
1450 * _in2 : 3,2,3,4,5,6,7,8
1451 * _in3 : 4,2,3,4,5,6,7,8
1452 * _in4 : 5,2,3,4,5,6,7,8
1453 * _in5 : 6,2,3,4,5,6,7,8
1454 * _in6 : 7,2,3,4,5,6,7,8
1455 * _in7 : 8,2,3,4,5,6,7,8
1456 *
1457 * _out0 : 1,2,3,4,5,6,7,8
1458 * _out1 : 2,2,2,2,2,2,2,2
1459 * _out2 : 3,3,3,3,3,3,3,3
1460 * _out3 : 4,4,4,4,4,4,4,4
1461 * _out4 : 5,5,5,5,5,5,5,5
1462 * _out5 : 6,6,6,6,6,6,6,6
1463 * _out6 : 7,7,7,7,7,7,7,7
1464 * _out7 : 8,8,8,8,8,8,8,8
1465 * =============================================================================
1466 */
#define LASX_TRANSPOSE8x8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                            _out7)                                           \
  {                                                                          \
    __m256i _s0_m, _s1_m;                                                    \
    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                              \
    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                              \
                                                                             \
    /* Rows 0-3: two interleave passes produce _tmp0_m.._tmp3_m. */          \
    _s0_m = __lasx_xvilvl_w(_in2, _in0);                                     \
    _s1_m = __lasx_xvilvl_w(_in3, _in1);                                     \
    _tmp0_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                 \
    _tmp1_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                 \
    _s0_m = __lasx_xvilvh_w(_in2, _in0);                                     \
    _s1_m = __lasx_xvilvh_w(_in3, _in1);                                     \
    _tmp2_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                 \
    _tmp3_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                 \
    /* Rows 4-7: same two passes produce _tmp4_m.._tmp7_m. */                \
    _s0_m = __lasx_xvilvl_w(_in6, _in4);                                     \
    _s1_m = __lasx_xvilvl_w(_in7, _in5);                                     \
    _tmp4_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                 \
    _tmp5_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                 \
    _s0_m = __lasx_xvilvh_w(_in6, _in4);                                     \
    _s1_m = __lasx_xvilvh_w(_in7, _in5);                                     \
    _tmp6_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                 \
    _tmp7_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                 \
    /* Pair 128-bit halves of both groups: 0x20 first, then 0x31. */         \
    _out0 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x20);                        \
    _out1 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x20);                        \
    _out2 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x20);                        \
    _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x20);                        \
    _out4 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x31);                        \
    _out5 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x31);                        \
    _out6 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x31);                        \
    _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x31);                        \
  }
1500
1501 /*
1502 * =============================================================================
1503 * Description : Transpose input 16x8 byte block
1504 * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
1505 * _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
1506 * (input 16x8 byte block)
1507 * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
1508 * _out7 (output 8x16 byte block)
1509 * Details : The rows of the matrix become columns, and the columns become
1510 * rows.
1511 * Example : See LASX_TRANSPOSE16x8_H
1512 * =============================================================================
1513 */
#define LASX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                             _in8, _in9, _in10, _in11, _in12, _in13, _in14,   \
                             _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
                             _out6, _out7)                                    \
  {                                                                           \
    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                               \
    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                               \
                                                                              \
    /* Stage 1: byte-interleave the low halves of rows i and i+2. */          \
    _tmp0_m = __lasx_xvilvl_b(_in2, _in0);                                    \
    _tmp1_m = __lasx_xvilvl_b(_in3, _in1);                                    \
    _tmp2_m = __lasx_xvilvl_b(_in6, _in4);                                    \
    _tmp3_m = __lasx_xvilvl_b(_in7, _in5);                                    \
    _tmp4_m = __lasx_xvilvl_b(_in10, _in8);                                   \
    _tmp5_m = __lasx_xvilvl_b(_in11, _in9);                                   \
    _tmp6_m = __lasx_xvilvl_b(_in14, _in12);                                  \
    _tmp7_m = __lasx_xvilvl_b(_in15, _in13);                                  \
    /* Stage 2: byte-interleave again; the outputs double as scratch. */      \
    _out0 = __lasx_xvilvl_b(_tmp1_m, _tmp0_m);                                \
    _out1 = __lasx_xvilvh_b(_tmp1_m, _tmp0_m);                                \
    _out2 = __lasx_xvilvl_b(_tmp3_m, _tmp2_m);                                \
    _out3 = __lasx_xvilvh_b(_tmp3_m, _tmp2_m);                                \
    _out4 = __lasx_xvilvl_b(_tmp5_m, _tmp4_m);                                \
    _out5 = __lasx_xvilvh_b(_tmp5_m, _tmp4_m);                                \
    _out6 = __lasx_xvilvl_b(_tmp7_m, _tmp6_m);                                \
    _out7 = __lasx_xvilvh_b(_tmp7_m, _tmp6_m);                                \
    /* Stage 3: word-interleave the stage-2 results. */                       \
    _tmp0_m = __lasx_xvilvl_w(_out2, _out0);                                  \
    _tmp2_m = __lasx_xvilvh_w(_out2, _out0);                                  \
    _tmp4_m = __lasx_xvilvl_w(_out3, _out1);                                  \
    _tmp6_m = __lasx_xvilvh_w(_out3, _out1);                                  \
    _tmp1_m = __lasx_xvilvl_w(_out6, _out4);                                  \
    _tmp3_m = __lasx_xvilvh_w(_out6, _out4);                                  \
    _tmp5_m = __lasx_xvilvl_w(_out7, _out5);                                  \
    _tmp7_m = __lasx_xvilvh_w(_out7, _out5);                                  \
    /* Stage 4: double-word interleave produces the final rows. */            \
    _out0 = __lasx_xvilvl_d(_tmp1_m, _tmp0_m);                                \
    _out1 = __lasx_xvilvh_d(_tmp1_m, _tmp0_m);                                \
    _out2 = __lasx_xvilvl_d(_tmp3_m, _tmp2_m);                                \
    _out3 = __lasx_xvilvh_d(_tmp3_m, _tmp2_m);                                \
    _out4 = __lasx_xvilvl_d(_tmp5_m, _tmp4_m);                                \
    _out5 = __lasx_xvilvh_d(_tmp5_m, _tmp4_m);                                \
    _out6 = __lasx_xvilvl_d(_tmp7_m, _tmp6_m);                                \
    _out7 = __lasx_xvilvh_d(_tmp7_m, _tmp6_m);                                \
  }
1555
/*
 * =============================================================================
 * Description : Transpose input 16x8 halfword block
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
 *                         _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
 *                         (input 16x8 halfword block)
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7 (output 8x16 halfword block)
 * Details     : The rows of the matrix become columns, and the columns become
 *               rows. Every input is read before the first output is
 *               assigned, so the output arguments may name the same variables
 *               as the input arguments (in-place use).
 * Example     : LASX_TRANSPOSE16x8_H
 *        _in0  : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in1  : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in2  : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in3  : 4,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in4  : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in5  : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in6  : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in7  : 8,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in8  : 9,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in9  : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in10 : 0,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in11 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in12 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in13 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in14 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in15 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *
 *        _out0 : 1,2,3,4,5,6,7,8,9,1,0,2,3,7,5,6
 *        _out1 : 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
 *        _out2 : 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
 *        _out3 : 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4
 *        _out4 : 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
 *        _out5 : 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6
 *        _out6 : 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
 *        _out7 : 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
 * =============================================================================
 */
#define LASX_TRANSPOSE16x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                             _in8, _in9, _in10, _in11, _in12, _in13, _in14, \
                             _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
                             _out6, _out7) \
  { \
    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
    __m256i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
    __m256i _lo0_m, _lo1_m, _lo2_m, _lo3_m; \
    \
    /* Pass 1: starts from __lasx_xvilvl_h of the inputs and yields the   */ \
    /* values destined for _out0.._out3. They are parked in locals so    */ \
    /* that pass 2 still sees the untouched inputs when outputs alias.   */ \
    _tmp0_m = __lasx_xvilvl_h(_in2, _in0); \
    _tmp1_m = __lasx_xvilvl_h(_in3, _in1); \
    _tmp2_m = __lasx_xvilvl_h(_in6, _in4); \
    _tmp3_m = __lasx_xvilvl_h(_in7, _in5); \
    _tmp4_m = __lasx_xvilvl_h(_in10, _in8); \
    _tmp5_m = __lasx_xvilvl_h(_in11, _in9); \
    _tmp6_m = __lasx_xvilvl_h(_in14, _in12); \
    _tmp7_m = __lasx_xvilvl_h(_in15, _in13); \
    _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m); \
    _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m); \
    _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m); \
    _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m); \
    _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m); \
    _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m); \
    _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m); \
    _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m); \
    _tmp0_m = __lasx_xvilvl_d(_t2, _t0); \
    _tmp2_m = __lasx_xvilvh_d(_t2, _t0); \
    _tmp4_m = __lasx_xvilvl_d(_t3, _t1); \
    _tmp6_m = __lasx_xvilvh_d(_t3, _t1); \
    _tmp1_m = __lasx_xvilvl_d(_t6, _t4); \
    _tmp3_m = __lasx_xvilvh_d(_t6, _t4); \
    _tmp5_m = __lasx_xvilvl_d(_t7, _t5); \
    _tmp7_m = __lasx_xvilvh_d(_t7, _t5); \
    _lo0_m = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20); \
    _lo1_m = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20); \
    _lo2_m = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20); \
    _lo3_m = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20); \
    \
    /* Pass 2: same sequence, starting from __lasx_xvilvh_h of the       */ \
    /* inputs; this is the last point at which the inputs are read.      */ \
    _tmp0_m = __lasx_xvilvh_h(_in2, _in0); \
    _tmp1_m = __lasx_xvilvh_h(_in3, _in1); \
    _tmp2_m = __lasx_xvilvh_h(_in6, _in4); \
    _tmp3_m = __lasx_xvilvh_h(_in7, _in5); \
    _tmp4_m = __lasx_xvilvh_h(_in10, _in8); \
    _tmp5_m = __lasx_xvilvh_h(_in11, _in9); \
    _tmp6_m = __lasx_xvilvh_h(_in14, _in12); \
    _tmp7_m = __lasx_xvilvh_h(_in15, _in13); \
    _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m); \
    _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m); \
    _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m); \
    _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m); \
    _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m); \
    _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m); \
    _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m); \
    _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m); \
    _tmp0_m = __lasx_xvilvl_d(_t2, _t0); \
    _tmp2_m = __lasx_xvilvh_d(_t2, _t0); \
    _tmp4_m = __lasx_xvilvl_d(_t3, _t1); \
    _tmp6_m = __lasx_xvilvh_d(_t3, _t1); \
    _tmp1_m = __lasx_xvilvl_d(_t6, _t4); \
    _tmp3_m = __lasx_xvilvh_d(_t6, _t4); \
    _tmp5_m = __lasx_xvilvl_d(_t7, _t5); \
    _tmp7_m = __lasx_xvilvh_d(_t7, _t5); \
    \
    /* All inputs have been consumed; it is now safe to write outputs.   */ \
    _out4 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20); \
    _out5 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20); \
    _out6 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20); \
    _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20); \
    _out0 = _lo0_m; \
    _out1 = _lo1_m; \
    _out2 = _lo2_m; \
    _out3 = _lo3_m; \
  }
1661
/*
 * =============================================================================
 * Description : Transpose a 4x4 block of halfword elements held in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3
 *               Outputs - _out0, _out1, _out2, _out3
 *               Return Type - signed halfword
 * Details     : Rows are turned into columns and columns into rows. All four
 *               inputs are read before any output is assigned.
 * Example     : See LASX_TRANSPOSE8x8_H
 * =============================================================================
 */
#define LASX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \
                            _out3) \
  { \
    __m256i _row01_m, _row23_m; \
    __m256i _even0_m, _even2_m; \
    \
    /* Halfword interleave of rows 0/1 and rows 2/3. */ \
    _row01_m = __lasx_xvilvl_h(_in1, _in0); \
    _row23_m = __lasx_xvilvl_h(_in3, _in2); \
    \
    /* Word interleave of the two pairs gives the even-numbered outputs. */ \
    _even0_m = __lasx_xvilvl_w(_row23_m, _row01_m); \
    _even2_m = __lasx_xvilvh_w(_row23_m, _row01_m); \
    \
    /* Each odd output is __lasx_xvilvh_d of the preceding even output   */ \
    /* with itself.                                                      */ \
    _out0 = _even0_m; \
    _out1 = __lasx_xvilvh_d(_even0_m, _even0_m); \
    _out2 = _even2_m; \
    _out3 = __lasx_xvilvh_d(_even2_m, _even2_m); \
  }
1685
/*
 * =============================================================================
 * Description : Transpose input 8x8 byte block
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *                         (input 8x8 byte block)
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7 (output 8x8 byte block)
 * Details     : All eight inputs are read before any output is assigned.
 *               Each odd-numbered output is the preceding even-numbered
 *               output passed through __lasx_xvbsrl_v with immediate 8.
 * Example     : See LASX_TRANSPOSE8x8_H
 * =============================================================================
 */
#define LASX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                            _out7) \
  { \
    __m256i _r02_m, _r13_m, _r46_m, _r57_m; \
    __m256i _top_lo_m, _top_hi_m, _bot_lo_m, _bot_hi_m; \
    __m256i _even0_m, _even2_m, _even4_m, _even6_m; \
    \
    /* Byte interleave of rows two apart. */ \
    _r02_m = __lasx_xvilvl_b(_in2, _in0); \
    _r13_m = __lasx_xvilvl_b(_in3, _in1); \
    _r46_m = __lasx_xvilvl_b(_in6, _in4); \
    _r57_m = __lasx_xvilvl_b(_in7, _in5); \
    \
    /* Second byte interleave: rows 0..3 ("top") and rows 4..7 ("bot"). */ \
    _top_lo_m = __lasx_xvilvl_b(_r13_m, _r02_m); \
    _top_hi_m = __lasx_xvilvh_b(_r13_m, _r02_m); \
    _bot_lo_m = __lasx_xvilvl_b(_r57_m, _r46_m); \
    _bot_hi_m = __lasx_xvilvh_b(_r57_m, _r46_m); \
    \
    /* Word interleave of top and bottom halves. */ \
    _even0_m = __lasx_xvilvl_w(_bot_lo_m, _top_lo_m); \
    _even2_m = __lasx_xvilvh_w(_bot_lo_m, _top_lo_m); \
    _even4_m = __lasx_xvilvl_w(_bot_hi_m, _top_hi_m); \
    _even6_m = __lasx_xvilvh_w(_bot_hi_m, _top_hi_m); \
    \
    _out0 = _even0_m; \
    _out1 = __lasx_xvbsrl_v(_even0_m, 8); \
    _out2 = _even2_m; \
    _out3 = __lasx_xvbsrl_v(_even2_m, 8); \
    _out4 = _even4_m; \
    _out5 = __lasx_xvbsrl_v(_even4_m, 8); \
    _out6 = _even6_m; \
    _out7 = __lasx_xvbsrl_v(_even6_m, 8); \
  }
1719
/*
 * =============================================================================
 * Description : Transpose 8x8 block with halfword elements in vectors.
 * Arguments   : Inputs  - _in0, _in1, ~
 *               Outputs - _out0, _out1, ~
 * Details     : Rows are turned into columns and columns into rows. All eight
 *               inputs are read before any output is assigned.
 * Example     : LASX_TRANSPOSE8x8_H
 *        _in0 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
 *        _in1 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8
 *        _in2 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8
 *        _in3 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
 *        _in4 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8
 *        _in5 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
 *        _in6 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
 *        _in7 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8
 *
 *       _out0 : 1,8,8,1, 9,1,1,9, 1,8,8,1, 9,1,1,9
 *       _out1 : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2
 *       _out2 : 3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3
 *       _out3 : 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4
 *       _out4 : 5,5,5,5, 5,5,5,5, 5,5,5,5, 5,5,5,5
 *       _out5 : 6,6,6,6, 6,6,6,6, 6,6,6,6, 6,6,6,6
 *       _out6 : 7,7,7,7, 7,7,7,7, 7,7,7,7, 7,7,7,7
 *       _out7 : 8,8,8,8, 8,8,8,8, 8,8,8,8, 8,8,8,8
 * =============================================================================
 */
#define LASX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                            _out7) \
  { \
    __m256i _a0_m, _a1_m, _a2_m, _a3_m; \
    __m256i _b0_m, _b1_m, _b2_m, _b3_m; \
    __m256i _top0_m, _top1_m, _top2_m, _top3_m; \
    __m256i _bot0_m, _bot1_m, _bot2_m, _bot3_m; \
    \
    /* Rows 0..3: interleave rows two apart, low then high halfwords. */ \
    _a0_m = __lasx_xvilvl_h(_in2, _in0); \
    _a1_m = __lasx_xvilvl_h(_in3, _in1); \
    _a2_m = __lasx_xvilvh_h(_in2, _in0); \
    _a3_m = __lasx_xvilvh_h(_in3, _in1); \
    _top0_m = __lasx_xvilvl_h(_a1_m, _a0_m); \
    _top1_m = __lasx_xvilvh_h(_a1_m, _a0_m); \
    _top2_m = __lasx_xvilvl_h(_a3_m, _a2_m); \
    _top3_m = __lasx_xvilvh_h(_a3_m, _a2_m); \
    \
    /* Rows 4..7: identical sequence. */ \
    _b0_m = __lasx_xvilvl_h(_in6, _in4); \
    _b1_m = __lasx_xvilvl_h(_in7, _in5); \
    _b2_m = __lasx_xvilvh_h(_in6, _in4); \
    _b3_m = __lasx_xvilvh_h(_in7, _in5); \
    _bot0_m = __lasx_xvilvl_h(_b1_m, _b0_m); \
    _bot1_m = __lasx_xvilvh_h(_b1_m, _b0_m); \
    _bot2_m = __lasx_xvilvl_h(_b3_m, _b2_m); \
    _bot3_m = __lasx_xvilvh_h(_b3_m, _b2_m); \
    \
    /* Combine top and bottom: xvpickev_d feeds the even-numbered outputs, */ \
    /* xvpickod_d the odd-numbered ones.                                  */ \
    _out0 = __lasx_xvpickev_d(_bot0_m, _top0_m); \
    _out1 = __lasx_xvpickod_d(_bot0_m, _top0_m); \
    _out2 = __lasx_xvpickev_d(_bot1_m, _top1_m); \
    _out3 = __lasx_xvpickod_d(_bot1_m, _top1_m); \
    _out4 = __lasx_xvpickev_d(_bot2_m, _top2_m); \
    _out5 = __lasx_xvpickod_d(_bot2_m, _top2_m); \
    _out6 = __lasx_xvpickev_d(_bot3_m, _top3_m); \
    _out7 = __lasx_xvpickod_d(_bot3_m, _top3_m); \
  }
1782
/*
 * =============================================================================
 * Description : Butterfly of 4 input vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3
 *               Outputs - _out0, _out1, _out2, _out3
 * Details     : Butterfly operation. The _B/_H/_W/_D variants use the
 *               __lasx_xvadd_* / __lasx_xvsub_* intrinsic with the matching
 *               suffix. All four results are computed before any output is
 *               assigned, so the outputs may name the same variables as the
 *               inputs (in-place use).
 * Example     : LASX_BUTTERFLY_4
 *               _out0 = _in0 + _in3;
 *               _out1 = _in1 + _in2;
 *               _out2 = _in1 - _in2;
 *               _out3 = _in0 - _in3;
 * =============================================================================
 */
/* Shared body of the LASX_BUTTERFLY_4_* macros; _ADD/_SUB name the intrinsics. */
#define LASX_BUTTERFLY_4_IMPL(_ADD, _SUB, _in0, _in1, _in2, _in3, _out0, \
                              _out1, _out2, _out3) \
  { \
    __m256i _bf0_m = _ADD(_in0, _in3); \
    __m256i _bf1_m = _ADD(_in1, _in2); \
    __m256i _bf2_m = _SUB(_in1, _in2); \
    __m256i _bf3_m = _SUB(_in0, _in3); \
    \
    /* Outputs are written only after every input has been read. */ \
    _out0 = _bf0_m; \
    _out1 = _bf1_m; \
    _out2 = _bf2_m; \
    _out3 = _bf3_m; \
  }
#define LASX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  LASX_BUTTERFLY_4_IMPL(__lasx_xvadd_b, __lasx_xvsub_b, _in0, _in1, _in2, \
                        _in3, _out0, _out1, _out2, _out3)
#define LASX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  LASX_BUTTERFLY_4_IMPL(__lasx_xvadd_h, __lasx_xvsub_h, _in0, _in1, _in2, \
                        _in3, _out0, _out1, _out2, _out3)
#define LASX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  LASX_BUTTERFLY_4_IMPL(__lasx_xvadd_w, __lasx_xvsub_w, _in0, _in1, _in2, \
                        _in3, _out0, _out1, _out2, _out3)
#define LASX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  LASX_BUTTERFLY_4_IMPL(__lasx_xvadd_d, __lasx_xvsub_d, _in0, _in1, _in2, \
                        _in3, _out0, _out1, _out2, _out3)
1824
/*
 * =============================================================================
 * Description : Butterfly of 8 input vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, ~
 *               Outputs - _out0, _out1, _out2, _out3, ~
 * Details     : Butterfly operation. The _B/_H/_W/_D variants use the
 *               __lasx_xvadd_* / __lasx_xvsub_* intrinsic with the matching
 *               suffix. All eight results are computed before any output is
 *               assigned, so the outputs may name the same variables as the
 *               inputs (in-place use).
 * Example     : LASX_BUTTERFLY_8
 *               _out0 = _in0 + _in7;
 *               _out1 = _in1 + _in6;
 *               _out2 = _in2 + _in5;
 *               _out3 = _in3 + _in4;
 *               _out4 = _in3 - _in4;
 *               _out5 = _in2 - _in5;
 *               _out6 = _in1 - _in6;
 *               _out7 = _in0 - _in7;
 * =============================================================================
 */
/* Shared body of the LASX_BUTTERFLY_8_* macros; _ADD/_SUB name the intrinsics. */
#define LASX_BUTTERFLY_8_IMPL(_ADD, _SUB, _in0, _in1, _in2, _in3, _in4, _in5, \
                              _in6, _in7, _out0, _out1, _out2, _out3, _out4, \
                              _out5, _out6, _out7) \
  { \
    __m256i _bf0_m = _ADD(_in0, _in7); \
    __m256i _bf1_m = _ADD(_in1, _in6); \
    __m256i _bf2_m = _ADD(_in2, _in5); \
    __m256i _bf3_m = _ADD(_in3, _in4); \
    __m256i _bf4_m = _SUB(_in3, _in4); \
    __m256i _bf5_m = _SUB(_in2, _in5); \
    __m256i _bf6_m = _SUB(_in1, _in6); \
    __m256i _bf7_m = _SUB(_in0, _in7); \
    \
    /* Outputs are written only after every input has been read. */ \
    _out0 = _bf0_m; \
    _out1 = _bf1_m; \
    _out2 = _bf2_m; \
    _out3 = _bf3_m; \
    _out4 = _bf4_m; \
    _out5 = _bf5_m; \
    _out6 = _bf6_m; \
    _out7 = _bf7_m; \
  }

#define LASX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7) \
  LASX_BUTTERFLY_8_IMPL(__lasx_xvadd_b, __lasx_xvsub_b, _in0, _in1, _in2, \
                        _in3, _in4, _in5, _in6, _in7, _out0, _out1, _out2, \
                        _out3, _out4, _out5, _out6, _out7)

#define LASX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7) \
  LASX_BUTTERFLY_8_IMPL(__lasx_xvadd_h, __lasx_xvsub_h, _in0, _in1, _in2, \
                        _in3, _in4, _in5, _in6, _in7, _out0, _out1, _out2, \
                        _out3, _out4, _out5, _out6, _out7)

#define LASX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7) \
  LASX_BUTTERFLY_8_IMPL(__lasx_xvadd_w, __lasx_xvsub_w, _in0, _in1, _in2, \
                        _in3, _in4, _in5, _in6, _in7, _out0, _out1, _out2, \
                        _out3, _out4, _out5, _out6, _out7)

#define LASX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7) \
  LASX_BUTTERFLY_8_IMPL(__lasx_xvadd_d, __lasx_xvsub_d, _in0, _in1, _in2, \
                        _in3, _in4, _in5, _in6, _in7, _out0, _out1, _out2, \
                        _out3, _out4, _out5, _out6, _out7)
1897
1898 #endif // LASX
1899
/*
 * =============================================================================
 * Description : Print out elements in vector.
 * Arguments   : Inputs  - RTYPE, element_num, in0, enter
 *               Outputs -
 * Details     : Print out 'element_num' elements in 'RTYPE' vector 'in0', if
 *               'enter' is TRUE, prefix "\nVP:" will be added first.
 *               'in0' is cast to 'RTYPE' and indexed with [], and every
 *               element is passed to printf with the "%d," conversion, so
 *               element types wider than int are not printed correctly.
 *               The macro calls printf, so <stdio.h> must be visible where it
 *               is expanded. 'in0' and 'element_num' may be arbitrary
 *               expressions; they are parenthesized inside the macro.
 * Example     : VECT_PRINT(v4i32,4,in0,1); // in0: 1,2,3,4
 *               VP:1,2,3,4,
 * =============================================================================
 */
#define VECT_PRINT(RTYPE, element_num, in0, enter) \
  { \
    RTYPE _tmp0 = (RTYPE)(in0); \
    int _i = 0; \
    if (enter) { \
      printf("\nVP:"); \
    } \
    for (_i = 0; _i < (element_num); _i++) { \
      printf("%d,", _tmp0[_i]); \
    } \
  }
1918
1919 #endif /* LOONGSON_INTRINSICS_H */
1920