1 /*
2 * Copyright 2022 The LibYuv Project Authors. All rights reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #ifndef INCLUDE_LIBYUV_LOONGSON_INTRINSICS_H
12 #define INCLUDE_LIBYUV_LOONGSON_INTRINSICS_H
13
14 /*
15 * Copyright (c) 2022 Loongson Technology Corporation Limited
16 * All rights reserved.
17 * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
18 * Xiwei Gu <guxiwei-hf@loongson.cn>
19 * Lu Wang <wanglu@loongson.cn>
20 *
21 * This file is a header file for loongarch builtin extension.
22 *
23 */
24
25 #ifndef LOONGSON_INTRINSICS_H
26 #define LOONGSON_INTRINSICS_H
27
/**
 * Version of this intrinsics helper header (MAJOR.MINOR.MICRO).
 *
 * MAJOR version: Macro usage changes.
 * MINOR version: Add new functions, or bug fixes.
 * MICRO version: Comment changes or implementation changes.
 */
#define LSOM_VERSION_MAJOR 1
#define LSOM_VERSION_MINOR 1
#define LSOM_VERSION_MICRO 0
36
/*
 * Apply the one-argument operation _INS to two independent inputs:
 *   _OUT0 = _INS(_IN0);
 *   _OUT1 = _INS(_IN1);
 * _INS may be a function, an intrinsic or a function-like macro; _OUT0 and
 * _OUT1 must be assignable lvalues.
 * The body is wrapped in do { } while (0) so that the invocation together
 * with its trailing semicolon is exactly one statement (safe in an unbraced
 * if/else).
 */
#define DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1) \
  do { \
    _OUT0 = _INS(_IN0); \
    _OUT1 = _INS(_IN1); \
  } while (0)
42
/*
 * Apply the two-argument operation _INS to two independent argument pairs:
 *   _OUT0 = _INS(_IN0, _IN1);
 *   _OUT1 = _INS(_IN2, _IN3);
 * _OUT0 and _OUT1 must be assignable lvalues.
 * The body is wrapped in do { } while (0) so that the invocation together
 * with its trailing semicolon is exactly one statement (safe in an unbraced
 * if/else).
 */
#define DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1) \
  do { \
    _OUT0 = _INS(_IN0, _IN1); \
    _OUT1 = _INS(_IN2, _IN3); \
  } while (0)
48
/*
 * Apply the three-argument operation _INS to two independent argument
 * triples:
 *   _OUT0 = _INS(_IN0, _IN1, _IN2);
 *   _OUT1 = _INS(_IN3, _IN4, _IN5);
 * _OUT0 and _OUT1 must be assignable lvalues.
 * The body is wrapped in do { } while (0) so that the invocation together
 * with its trailing semicolon is exactly one statement (safe in an unbraced
 * if/else).
 */
#define DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1) \
  do { \
    _OUT0 = _INS(_IN0, _IN1, _IN2); \
    _OUT1 = _INS(_IN3, _IN4, _IN5); \
  } while (0)
54
/*
 * Apply the one-argument operation _INS to four independent inputs, in
 * order: _OUTn = _INS(_INn) for n = 0..3.
 * The outputs must be assignable lvalues.
 * Written out directly (rather than via nested brace blocks) and wrapped in
 * do { } while (0) so the invocation plus its trailing semicolon is exactly
 * one statement (safe in an unbraced if/else).
 */
#define DUP4_ARG1(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1, _OUT2, _OUT3) \
  do { \
    _OUT0 = _INS(_IN0); \
    _OUT1 = _INS(_IN1); \
    _OUT2 = _INS(_IN2); \
    _OUT3 = _INS(_IN3); \
  } while (0)
60
/*
 * Apply the two-argument operation _INS to four independent argument pairs,
 * in order:
 *   _OUT0 = _INS(_IN0, _IN1);   _OUT1 = _INS(_IN2, _IN3);
 *   _OUT2 = _INS(_IN4, _IN5);   _OUT3 = _INS(_IN6, _IN7);
 * The outputs must be assignable lvalues.
 * Written out directly (rather than via nested brace blocks) and wrapped in
 * do { } while (0) so the invocation plus its trailing semicolon is exactly
 * one statement (safe in an unbraced if/else).
 */
#define DUP4_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _OUT0, \
                  _OUT1, _OUT2, _OUT3) \
  do { \
    _OUT0 = _INS(_IN0, _IN1); \
    _OUT1 = _INS(_IN2, _IN3); \
    _OUT2 = _INS(_IN4, _IN5); \
    _OUT3 = _INS(_IN6, _IN7); \
  } while (0)
67
/*
 * Apply the three-argument operation _INS to four independent argument
 * triples, in order:
 *   _OUT0 = _INS(_IN0, _IN1, _IN2);   _OUT1 = _INS(_IN3, _IN4, _IN5);
 *   _OUT2 = _INS(_IN6, _IN7, _IN8);   _OUT3 = _INS(_IN9, _IN10, _IN11);
 * The outputs must be assignable lvalues.
 * Written out directly (rather than via nested brace blocks) and wrapped in
 * do { } while (0) so the invocation plus its trailing semicolon is exactly
 * one statement (safe in an unbraced if/else).
 */
#define DUP4_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _IN8, \
                  _IN9, _IN10, _IN11, _OUT0, _OUT1, _OUT2, _OUT3) \
  do { \
    _OUT0 = _INS(_IN0, _IN1, _IN2); \
    _OUT1 = _INS(_IN3, _IN4, _IN5); \
    _OUT2 = _INS(_IN6, _IN7, _IN8); \
    _OUT3 = _INS(_IN9, _IN10, _IN11); \
  } while (0)
74
75 #ifdef __loongarch_sx
76 #include <lsxintrin.h>
77 /*
78 * =============================================================================
79 * Description : Dot product & addition of byte vector elements
80 * Arguments : Inputs - in_c, in_h, in_l
81 * Outputs - out
82 * Return Type - halfword
83 * Details : Signed byte elements from in_h are multiplied by
84 * signed byte elements from in_l, and then added adjacent to
85 * each other to get results with the twice size of input.
86 * Then the results plus to signed half-word elements from in_c.
87 * Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
88 * in_c : 1,2,3,4, 1,2,3,4
89 * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
90 * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
91 * out : 23,40,41,26, 23,40,41,26
92 * =============================================================================
93 */
__lsx_vdp2add_h_b(__m128i in_c,__m128i in_h,__m128i in_l)94 static inline __m128i __lsx_vdp2add_h_b(__m128i in_c,
95 __m128i in_h,
96 __m128i in_l) {
97 __m128i out;
98
99 out = __lsx_vmaddwev_h_b(in_c, in_h, in_l);
100 out = __lsx_vmaddwod_h_b(out, in_h, in_l);
101 return out;
102 }
103
104 /*
105 * =============================================================================
106 * Description : Dot product & addition of byte vector elements
107 * Arguments : Inputs - in_c, in_h, in_l
108 * Outputs - out
109 * Return Type - halfword
110 * Details : Unsigned byte elements from in_h are multiplied by
111 * unsigned byte elements from in_l, and then added adjacent to
112 * each other to get results with the twice size of input.
113 * The results plus to signed half-word elements from in_c.
114 * Example : out = __lsx_vdp2add_h_bu(in_c, in_h, in_l)
115 * in_c : 1,2,3,4, 1,2,3,4
116 * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
117 * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
118 * out : 23,40,41,26, 23,40,41,26
119 * =============================================================================
120 */
__lsx_vdp2add_h_bu(__m128i in_c,__m128i in_h,__m128i in_l)121 static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c,
122 __m128i in_h,
123 __m128i in_l) {
124 __m128i out;
125
126 out = __lsx_vmaddwev_h_bu(in_c, in_h, in_l);
127 out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
128 return out;
129 }
130
131 /*
132 * =============================================================================
133 * Description : Dot product & addition of byte vector elements
134 * Arguments : Inputs - in_c, in_h, in_l
135 * Outputs - out
136 * Return Type - halfword
137 * Details : Unsigned byte elements from in_h are multiplied by
138 * signed byte elements from in_l, and then added adjacent to
139 * each other to get results with the twice size of input.
140 * The results plus to signed half-word elements from in_c.
141 * Example : out = __lsx_vdp2add_h_bu_b(in_c, in_h, in_l)
142 * in_c : 1,1,1,1, 1,1,1,1
143 * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
144 * in_l : -1,-2,-3,-4, -5,-6,-7,-8, 1,2,3,4, 5,6,7,8
145 * out : -4,-24,-60,-112, 6,26,62,114
146 * =============================================================================
147 */
__lsx_vdp2add_h_bu_b(__m128i in_c,__m128i in_h,__m128i in_l)148 static inline __m128i __lsx_vdp2add_h_bu_b(__m128i in_c,
149 __m128i in_h,
150 __m128i in_l) {
151 __m128i out;
152
153 out = __lsx_vmaddwev_h_bu_b(in_c, in_h, in_l);
154 out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
155 return out;
156 }
157
158 /*
159 * =============================================================================
160 * Description : Dot product & addition of half-word vector elements
161 * Arguments : Inputs - in_c, in_h, in_l
162 * Outputs - out
163 * Return Type - __m128i
164 * Details : Signed half-word elements from in_h are multiplied by
165 * signed half-word elements from in_l, and then added adjacent to
166 * each other to get results with the twice size of input.
167 * Then the results plus to signed word elements from in_c.
168 * Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
169 * in_c : 1,2,3,4
170 * in_h : 1,2,3,4, 5,6,7,8
171 * in_l : 8,7,6,5, 4,3,2,1
172 * out : 23,40,41,26
173 * =============================================================================
174 */
__lsx_vdp2add_w_h(__m128i in_c,__m128i in_h,__m128i in_l)175 static inline __m128i __lsx_vdp2add_w_h(__m128i in_c,
176 __m128i in_h,
177 __m128i in_l) {
178 __m128i out;
179
180 out = __lsx_vmaddwev_w_h(in_c, in_h, in_l);
181 out = __lsx_vmaddwod_w_h(out, in_h, in_l);
182 return out;
183 }
184
185 /*
186 * =============================================================================
187 * Description : Dot product of byte vector elements
188 * Arguments : Inputs - in_h, in_l
189 * Outputs - out
190 * Return Type - halfword
191 * Details : Signed byte elements from in_h are multiplied by
192 * signed byte elements from in_l, and then added adjacent to
193 * each other to get results with the twice size of input.
194 * Example : out = __lsx_vdp2_h_b(in_h, in_l)
195 * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
196 * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
197 * out : 22,38,38,22, 22,38,38,22
198 * =============================================================================
199 */
__lsx_vdp2_h_b(__m128i in_h,__m128i in_l)200 static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l) {
201 __m128i out;
202
203 out = __lsx_vmulwev_h_b(in_h, in_l);
204 out = __lsx_vmaddwod_h_b(out, in_h, in_l);
205 return out;
206 }
207
208 /*
209 * =============================================================================
210 * Description : Dot product of byte vector elements
211 * Arguments : Inputs - in_h, in_l
212 * Outputs - out
213 * Return Type - halfword
214 * Details : Unsigned byte elements from in_h are multiplied by
215 * unsigned byte elements from in_l, and then added adjacent to
216 * each other to get results with the twice size of input.
217 * Example : out = __lsx_vdp2_h_bu(in_h, in_l)
218 * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
219 * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
220 * out : 22,38,38,22, 22,38,38,22
221 * =============================================================================
222 */
__lsx_vdp2_h_bu(__m128i in_h,__m128i in_l)223 static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l) {
224 __m128i out;
225
226 out = __lsx_vmulwev_h_bu(in_h, in_l);
227 out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
228 return out;
229 }
230
231 /*
232 * =============================================================================
233 * Description : Dot product of byte vector elements
234 * Arguments : Inputs - in_h, in_l
235 * Outputs - out
236 * Return Type - halfword
237 * Details : Unsigned byte elements from in_h are multiplied by
238 * signed byte elements from in_l, and then added adjacent to
239 * each other to get results with the twice size of input.
240 * Example : out = __lsx_vdp2_h_bu_b(in_h, in_l)
241 * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
242 * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,-1
243 * out : 22,38,38,22, 22,38,38,6
244 * =============================================================================
245 */
__lsx_vdp2_h_bu_b(__m128i in_h,__m128i in_l)246 static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l) {
247 __m128i out;
248
249 out = __lsx_vmulwev_h_bu_b(in_h, in_l);
250 out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
251 return out;
252 }
253
254 /*
255 * =============================================================================
256 * Description : Dot product of byte vector elements
257 * Arguments : Inputs - in_h, in_l
258 * Outputs - out
259 * Return Type - halfword
260 * Details : Signed byte elements from in_h are multiplied by
261 * signed byte elements from in_l, and then added adjacent to
262 * each other to get results with the twice size of input.
263 * Example : out = __lsx_vdp2_w_h(in_h, in_l)
264 * in_h : 1,2,3,4, 5,6,7,8
265 * in_l : 8,7,6,5, 4,3,2,1
266 * out : 22,38,38,22
267 * =============================================================================
268 */
__lsx_vdp2_w_h(__m128i in_h,__m128i in_l)269 static inline __m128i __lsx_vdp2_w_h(__m128i in_h, __m128i in_l) {
270 __m128i out;
271
272 out = __lsx_vmulwev_w_h(in_h, in_l);
273 out = __lsx_vmaddwod_w_h(out, in_h, in_l);
274 return out;
275 }
276
277 /*
278 * =============================================================================
279 * Description : Clip all halfword elements of input vector between min & max
280 * out = ((_in) < (min)) ? (min) : (((_in) > (max)) ? (max) :
281 * (_in))
282 * Arguments : Inputs - _in (input vector)
283 * - min (min threshold)
284 * - max (max threshold)
285 * Outputs - out (output vector with clipped elements)
286 * Return Type - signed halfword
287 * Example : out = __lsx_vclip_h(_in)
288 * _in : -8,2,280,249, -8,255,280,249
289 * min : 1,1,1,1, 1,1,1,1
290 * max : 9,9,9,9, 9,9,9,9
291 * out : 1,2,9,9, 1,9,9,9
292 * =============================================================================
293 */
__lsx_vclip_h(__m128i _in,__m128i min,__m128i max)294 static inline __m128i __lsx_vclip_h(__m128i _in, __m128i min, __m128i max) {
295 __m128i out;
296
297 out = __lsx_vmax_h(min, _in);
298 out = __lsx_vmin_h(max, out);
299 return out;
300 }
301
302 /*
303 * =============================================================================
304 * Description : Set each element of vector between 0 and 255
305 * Arguments : Inputs - _in
306 * Outputs - out
307 * Return Type - halfword
308 * Details : Signed byte elements from _in are clamped between 0 and 255.
309 * Example : out = __lsx_vclip255_h(_in)
310 * _in : -8,255,280,249, -8,255,280,249
311 * out : 0,255,255,249, 0,255,255,249
312 * =============================================================================
313 */
__lsx_vclip255_h(__m128i _in)314 static inline __m128i __lsx_vclip255_h(__m128i _in) {
315 __m128i out;
316
317 out = __lsx_vmaxi_h(_in, 0);
318 out = __lsx_vsat_hu(out, 7);
319 return out;
320 }
321
322 /*
323 * =============================================================================
324 * Description : Set each element of vector between 0 and 255
325 * Arguments : Inputs - _in
326 * Outputs - out
327 * Return Type - word
328 * Details : Signed byte elements from _in are clamped between 0 and 255.
329 * Example : out = __lsx_vclip255_w(_in)
330 * _in : -8,255,280,249
331 * out : 0,255,255,249
332 * =============================================================================
333 */
__lsx_vclip255_w(__m128i _in)334 static inline __m128i __lsx_vclip255_w(__m128i _in) {
335 __m128i out;
336
337 out = __lsx_vmaxi_w(_in, 0);
338 out = __lsx_vsat_wu(out, 7);
339 return out;
340 }
341
/*
 * =============================================================================
 * Description : Swap two vector variables
 * Arguments   : Inputs  - _in0, _in1 (assignable vector variables)
 *               Outputs - _in0, _in1 (in-place)
 * Details     : The values are exchanged through a temporary. Unlike an
 *               xor-based swap this stays correct when both arguments name
 *               the same variable (an xor swap would zero it).
 *               Wrapped in do { } while (0) so the invocation plus its
 *               trailing semicolon is a single statement.
 * Example     : LSX_SWAP(_in0, _in1)
 *         _in0 : 1,2,3,4
 *         _in1 : 5,6,7,8
 *   _in0(out) : 5,6,7,8
 *   _in1(out) : 1,2,3,4
 * =============================================================================
 */
#define LSX_SWAP(_in0, _in1) \
  do { \
    __m128i _swap_tmp_m = _in0; \
    _in0 = _in1; \
    _in1 = _swap_tmp_m; \
  } while (0)
361
/*
 * =============================================================================
 * Description : Transpose 4x4 block with word elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3     (rows)
 *               Outputs - _out0, _out1, _out2, _out3 (columns)
 * Details     : Rows are first interleaved word-wise in pairs, then the
 *               pairs are interleaved double-word-wise. All inputs are read
 *               into temporaries before any output is written.
 *               Wrapped in do { } while (0) so the invocation plus its
 *               trailing semicolon is a single statement.
 * Example     :
 *               1, 2, 3, 4            1, 5, 9,13
 *               5, 6, 7, 8    to      2, 6,10,14
 *               9,10,11,12  =====>    3, 7,11,15
 *              13,14,15,16            4, 8,12,16
 * =============================================================================
 */
#define LSX_TRANSPOSE4x4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  do { \
    __m128i _t0, _t1, _t2, _t3; \
    \
    _t0 = __lsx_vilvl_w(_in1, _in0); \
    _t1 = __lsx_vilvh_w(_in1, _in0); \
    _t2 = __lsx_vilvl_w(_in3, _in2); \
    _t3 = __lsx_vilvh_w(_in3, _in2); \
    _out0 = __lsx_vilvl_d(_t2, _t0); \
    _out1 = __lsx_vilvh_d(_t2, _t0); \
    _out2 = __lsx_vilvl_d(_t3, _t1); \
    _out3 = __lsx_vilvh_d(_t3, _t1); \
  } while (0)
388
/*
 * =============================================================================
 * Description : Transpose 8x8 block with byte elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7
 * Details     : The rows of the matrix become columns, and the columns
 *               become rows. Only the low 8 bytes of each input are used.
 *               Odd-numbered outputs are produced by shuffling the upper
 *               half of the preceding even output down and filling the rest
 *               from a zero vector.
 *               NOTE(review): the even outputs appear to keep the following
 *               column in their upper 8 bytes rather than zeros - confirm
 *               before relying on the upper half.
 *               The macro-local helpers carry a `_m` suffix so they cannot
 *               capture caller arguments named `zero` or `shuf8`, and the
 *               body is wrapped in do { } while (0) so the invocation plus
 *               its trailing semicolon is a single statement.
 * Example     : LSX_TRANSPOSE8x8_B
 *        _in0 : 00,01,02,03,04,05,06,07, 00,00,00,00,00,00,00,00
 *        _in1 : 10,11,12,13,14,15,16,17, 00,00,00,00,00,00,00,00
 *        _in2 : 20,21,22,23,24,25,26,27, 00,00,00,00,00,00,00,00
 *        _in3 : 30,31,32,33,34,35,36,37, 00,00,00,00,00,00,00,00
 *        _in4 : 40,41,42,43,44,45,46,47, 00,00,00,00,00,00,00,00
 *        _in5 : 50,51,52,53,54,55,56,57, 00,00,00,00,00,00,00,00
 *        _in6 : 60,61,62,63,64,65,66,67, 00,00,00,00,00,00,00,00
 *        _in7 : 70,71,72,73,74,75,76,77, 00,00,00,00,00,00,00,00
 *
 *   low 8 bytes of each output:
 *       _out0 : 00,10,20,30,40,50,60,70
 *       _out1 : 01,11,21,31,41,51,61,71
 *       _out2 : 02,12,22,32,42,52,62,72
 *       _out3 : 03,13,23,33,43,53,63,73
 *       _out4 : 04,14,24,34,44,54,64,74
 *       _out5 : 05,15,25,35,45,55,65,75
 *       _out6 : 06,16,26,36,46,56,66,76
 *       _out7 : 07,17,27,37,47,57,67,77
 * =============================================================================
 */
#define LSX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7) \
  do { \
    __m128i _zero_m = {0}; \
    /* Byte indices 8..15 pick the upper half of the data operand; */ \
    /* indices 16..23 pick bytes from the zero operand. */ \
    __m128i _shuf8_m = {0x0F0E0D0C0B0A0908, 0x1716151413121110}; \
    __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
    \
    _t0 = __lsx_vilvl_b(_in2, _in0); \
    _t1 = __lsx_vilvl_b(_in3, _in1); \
    _t2 = __lsx_vilvl_b(_in6, _in4); \
    _t3 = __lsx_vilvl_b(_in7, _in5); \
    _t4 = __lsx_vilvl_b(_t1, _t0); \
    _t5 = __lsx_vilvh_b(_t1, _t0); \
    _t6 = __lsx_vilvl_b(_t3, _t2); \
    _t7 = __lsx_vilvh_b(_t3, _t2); \
    _out0 = __lsx_vilvl_w(_t6, _t4); \
    _out2 = __lsx_vilvh_w(_t6, _t4); \
    _out4 = __lsx_vilvl_w(_t7, _t5); \
    _out6 = __lsx_vilvh_w(_t7, _t5); \
    _out1 = __lsx_vshuf_b(_zero_m, _out0, _shuf8_m); \
    _out3 = __lsx_vshuf_b(_zero_m, _out2, _shuf8_m); \
    _out5 = __lsx_vshuf_b(_zero_m, _out4, _shuf8_m); \
    _out7 = __lsx_vshuf_b(_zero_m, _out6, _shuf8_m); \
  } while (0)
442
/*
 * =============================================================================
 * Description : Transpose 8x8 block with half-word elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7
 * Details     : Rows 4..7 and rows 0..3 are each interleaved halfword-wise
 *               in two stages; the final even/odd double-word picks combine
 *               the two groups. All inputs are read into temporaries before
 *               any output is written.
 *               Wrapped in do { } while (0) so the invocation plus its
 *               trailing semicolon is a single statement.
 * Example     :
 *              00,01,02,03,04,05,06,07           00,10,20,30,40,50,60,70
 *              10,11,12,13,14,15,16,17           01,11,21,31,41,51,61,71
 *              20,21,22,23,24,25,26,27           02,12,22,32,42,52,62,72
 *              30,31,32,33,34,35,36,37    to     03,13,23,33,43,53,63,73
 *              40,41,42,43,44,45,46,47  ======>  04,14,24,34,44,54,64,74
 *              50,51,52,53,54,55,56,57           05,15,25,35,45,55,65,75
 *              60,61,62,63,64,65,66,67           06,16,26,36,46,56,66,76
 *              70,71,72,73,74,75,76,77           07,17,27,37,47,57,67,77
 * =============================================================================
 */
#define LSX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7) \
  do { \
    __m128i _s0, _s1, _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
    \
    /* Rows 4..7, low halves then high halves. */ \
    _s0 = __lsx_vilvl_h(_in6, _in4); \
    _s1 = __lsx_vilvl_h(_in7, _in5); \
    _t0 = __lsx_vilvl_h(_s1, _s0); \
    _t1 = __lsx_vilvh_h(_s1, _s0); \
    _s0 = __lsx_vilvh_h(_in6, _in4); \
    _s1 = __lsx_vilvh_h(_in7, _in5); \
    _t2 = __lsx_vilvl_h(_s1, _s0); \
    _t3 = __lsx_vilvh_h(_s1, _s0); \
    /* Rows 0..3, low halves then high halves. */ \
    _s0 = __lsx_vilvl_h(_in2, _in0); \
    _s1 = __lsx_vilvl_h(_in3, _in1); \
    _t4 = __lsx_vilvl_h(_s1, _s0); \
    _t5 = __lsx_vilvh_h(_s1, _s0); \
    _s0 = __lsx_vilvh_h(_in2, _in0); \
    _s1 = __lsx_vilvh_h(_in3, _in1); \
    _t6 = __lsx_vilvl_h(_s1, _s0); \
    _t7 = __lsx_vilvh_h(_s1, _s0); \
    \
    _out0 = __lsx_vpickev_d(_t0, _t4); \
    _out2 = __lsx_vpickev_d(_t1, _t5); \
    _out4 = __lsx_vpickev_d(_t2, _t6); \
    _out6 = __lsx_vpickev_d(_t3, _t7); \
    _out1 = __lsx_vpickod_d(_t0, _t4); \
    _out3 = __lsx_vpickod_d(_t1, _t5); \
    _out5 = __lsx_vpickod_d(_t2, _t6); \
    _out7 = __lsx_vpickod_d(_t3, _t7); \
  } while (0)
492
/*
 * =============================================================================
 * Description : Transpose input 8x4 byte block into 4x8
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *                         (8 rows, the low 4 bytes of each are used)
 *               Outputs - _out0, _out1, _out2, _out3
 *                         (4 rows of 8 bytes, in the low half of each vector)
 * Details     : The rows of the matrix become columns, and the columns become
 *               rows.
 *               NOTE(review): the upper 8 bytes of each output appear to hold
 *               data from other columns rather than zeros - confirm before
 *               relying on the upper half.
 *               Wrapped in do { } while (0) so the invocation plus its
 *               trailing semicolon is a single statement.
 * Example     : LSX_TRANSPOSE8x4_B
 *        _in0 : 00,01,02,03,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in1 : 10,11,12,13,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in2 : 20,21,22,23,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in3 : 30,31,32,33,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in4 : 40,41,42,43,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in5 : 50,51,52,53,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in6 : 60,61,62,63,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in7 : 70,71,72,73,00,00,00,00, 00,00,00,00,00,00,00,00
 *
 *   low 8 bytes of each output:
 *       _out0 : 00,10,20,30,40,50,60,70
 *       _out1 : 01,11,21,31,41,51,61,71
 *       _out2 : 02,12,22,32,42,52,62,72
 *       _out3 : 03,13,23,33,43,53,63,73
 * =============================================================================
 */
#define LSX_TRANSPOSE8x4_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                           _out0, _out1, _out2, _out3) \
  do { \
    __m128i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
    \
    _tmp0_m = __lsx_vpackev_w(_in4, _in0); \
    _tmp1_m = __lsx_vpackev_w(_in5, _in1); \
    _tmp2_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m); \
    _tmp0_m = __lsx_vpackev_w(_in6, _in2); \
    _tmp1_m = __lsx_vpackev_w(_in7, _in3); \
    \
    _tmp3_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m); \
    _tmp0_m = __lsx_vilvl_h(_tmp3_m, _tmp2_m); \
    _tmp1_m = __lsx_vilvh_h(_tmp3_m, _tmp2_m); \
    \
    _out0 = __lsx_vilvl_w(_tmp1_m, _tmp0_m); \
    _out2 = __lsx_vilvh_w(_tmp1_m, _tmp0_m); \
    /* _out1 and _out3 are derived from the already-written _out0/_out2. */ \
    _out1 = __lsx_vilvh_d(_out2, _out0); \
    _out3 = __lsx_vilvh_d(_out0, _out2); \
  } while (0)
537
/*
 * =============================================================================
 * Description : Transpose 16x8 block with byte elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
 *                         _in8, _in9, _in10, _in11, _in12, _in13, _in14,
 *                         _in15 (16 rows, the low 8 bytes of each are used)
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7 (8 rows of 16 bytes)
 * Details     : Rows 0..7 and rows 8..15 are interleaved byte-wise, then
 *               word-wise, and finally joined double-word-wise. All inputs
 *               are read into temporaries before any output is written.
 *               Relies on the DUP2_ARG2 / DUP4_ARG2 helper macros.
 *               Wrapped in do { } while (0) so the invocation plus its
 *               trailing semicolon is a single statement.
 * Example     :
 *              000,001,002,003,004,005,006,007
 *              008,009,010,011,012,013,014,015
 *              016,017,018,019,020,021,022,023
 *              024,025,026,027,028,029,030,031
 *              032,033,034,035,036,037,038,039
 *              040,041,042,043,044,045,046,047        000,008,...,112,120
 *              048,049,050,051,052,053,054,055        001,009,...,113,121
 *              056,057,058,059,060,061,062,063   to   002,010,...,114,122
 *              064,065,066,067,068,069,070,071 =====> 003,011,...,115,123
 *              072,073,074,075,076,077,078,079        004,012,...,116,124
 *              080,081,082,083,084,085,086,087        005,013,...,117,125
 *              088,089,090,091,092,093,094,095        006,014,...,118,126
 *              096,097,098,099,100,101,102,103        007,015,...,119,127
 *              104,105,106,107,108,109,110,111
 *              112,113,114,115,116,117,118,119
 *              120,121,122,123,124,125,126,127
 * =============================================================================
 */
#define LSX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                            _in8, _in9, _in10, _in11, _in12, _in13, _in14, \
                            _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
                            _out6, _out7) \
  do { \
    __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7; \
    __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
    /* Stage 1: byte interleave of row pairs (rows 0..7, then rows 8..15). */ \
    DUP4_ARG2(__lsx_vilvl_b, _in2, _in0, _in3, _in1, _in6, _in4, _in7, _in5, \
              _tmp0, _tmp1, _tmp2, _tmp3); \
    DUP4_ARG2(__lsx_vilvl_b, _in10, _in8, _in11, _in9, _in14, _in12, _in15, \
              _in13, _tmp4, _tmp5, _tmp6, _tmp7); \
    /* Stage 2: byte interleave of the stage-1 results. */ \
    DUP2_ARG2(__lsx_vilvl_b, _tmp1, _tmp0, _tmp3, _tmp2, _t0, _t2); \
    DUP2_ARG2(__lsx_vilvh_b, _tmp1, _tmp0, _tmp3, _tmp2, _t1, _t3); \
    DUP2_ARG2(__lsx_vilvl_b, _tmp5, _tmp4, _tmp7, _tmp6, _t4, _t6); \
    DUP2_ARG2(__lsx_vilvh_b, _tmp5, _tmp4, _tmp7, _tmp6, _t5, _t7); \
    /* Stage 3: word interleave. */ \
    DUP2_ARG2(__lsx_vilvl_w, _t2, _t0, _t3, _t1, _tmp0, _tmp4); \
    DUP2_ARG2(__lsx_vilvh_w, _t2, _t0, _t3, _t1, _tmp2, _tmp6); \
    DUP2_ARG2(__lsx_vilvl_w, _t6, _t4, _t7, _t5, _tmp1, _tmp5); \
    DUP2_ARG2(__lsx_vilvh_w, _t6, _t4, _t7, _t5, _tmp3, _tmp7); \
    /* Stage 4: double-word interleave joins both row groups. */ \
    DUP2_ARG2(__lsx_vilvl_d, _tmp1, _tmp0, _tmp3, _tmp2, _out0, _out2); \
    DUP2_ARG2(__lsx_vilvh_d, _tmp1, _tmp0, _tmp3, _tmp2, _out1, _out3); \
    DUP2_ARG2(__lsx_vilvl_d, _tmp5, _tmp4, _tmp7, _tmp6, _out4, _out6); \
    DUP2_ARG2(__lsx_vilvh_d, _tmp5, _tmp4, _tmp7, _tmp6, _out5, _out7); \
  } while (0)
588
/*
 * =============================================================================
 * Description : Butterfly of 4 input vectors (byte elements)
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3
 *               Outputs - _out0, _out1, _out2, _out3
 * Details     : Butterfly operation. All four results are computed into
 *               temporaries before any output is written, so an output may
 *               name the same variable as an input (in-place use).
 *               Wrapped in do { } while (0) so the invocation plus its
 *               trailing semicolon is a single statement.
 * Example     :
 *               _out0 = _in0 + _in3;
 *               _out1 = _in1 + _in2;
 *               _out2 = _in1 - _in2;
 *               _out3 = _in0 - _in3;
 * =============================================================================
 */
#define LSX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  do { \
    const __m128i _bf_s0_m = __lsx_vadd_b(_in0, _in3); \
    const __m128i _bf_s1_m = __lsx_vadd_b(_in1, _in2); \
    const __m128i _bf_d1_m = __lsx_vsub_b(_in1, _in2); \
    const __m128i _bf_d0_m = __lsx_vsub_b(_in0, _in3); \
    _out0 = _bf_s0_m; \
    _out1 = _bf_s1_m; \
    _out2 = _bf_d1_m; \
    _out3 = _bf_d0_m; \
  } while (0)
/*
 * Butterfly of 4 input vectors (half-word elements):
 *   _out0 = _in0 + _in3;  _out1 = _in1 + _in2;
 *   _out2 = _in1 - _in2;  _out3 = _in0 - _in3;
 * Results are computed into temporaries before any output is written, so an
 * output may name the same variable as an input. Wrapped in do { } while (0)
 * so the invocation plus its trailing semicolon is a single statement.
 */
#define LSX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  do { \
    const __m128i _bf_s0_m = __lsx_vadd_h(_in0, _in3); \
    const __m128i _bf_s1_m = __lsx_vadd_h(_in1, _in2); \
    const __m128i _bf_d1_m = __lsx_vsub_h(_in1, _in2); \
    const __m128i _bf_d0_m = __lsx_vsub_h(_in0, _in3); \
    _out0 = _bf_s0_m; \
    _out1 = _bf_s1_m; \
    _out2 = _bf_d1_m; \
    _out3 = _bf_d0_m; \
  } while (0)
/*
 * Butterfly of 4 input vectors (word elements):
 *   _out0 = _in0 + _in3;  _out1 = _in1 + _in2;
 *   _out2 = _in1 - _in2;  _out3 = _in0 - _in3;
 * Results are computed into temporaries before any output is written, so an
 * output may name the same variable as an input. Wrapped in do { } while (0)
 * so the invocation plus its trailing semicolon is a single statement.
 */
#define LSX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  do { \
    const __m128i _bf_s0_m = __lsx_vadd_w(_in0, _in3); \
    const __m128i _bf_s1_m = __lsx_vadd_w(_in1, _in2); \
    const __m128i _bf_d1_m = __lsx_vsub_w(_in1, _in2); \
    const __m128i _bf_d0_m = __lsx_vsub_w(_in0, _in3); \
    _out0 = _bf_s0_m; \
    _out1 = _bf_s1_m; \
    _out2 = _bf_d1_m; \
    _out3 = _bf_d0_m; \
  } while (0)
/*
 * Butterfly of 4 input vectors (double-word elements):
 *   _out0 = _in0 + _in3;  _out1 = _in1 + _in2;
 *   _out2 = _in1 - _in2;  _out3 = _in0 - _in3;
 * Results are computed into temporaries before any output is written, so an
 * output may name the same variable as an input. Wrapped in do { } while (0)
 * so the invocation plus its trailing semicolon is a single statement.
 */
#define LSX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  do { \
    const __m128i _bf_s0_m = __lsx_vadd_d(_in0, _in3); \
    const __m128i _bf_s1_m = __lsx_vadd_d(_in1, _in2); \
    const __m128i _bf_d1_m = __lsx_vsub_d(_in1, _in2); \
    const __m128i _bf_d0_m = __lsx_vsub_d(_in0, _in3); \
    _out0 = _bf_s0_m; \
    _out1 = _bf_s1_m; \
    _out2 = _bf_d1_m; \
    _out3 = _bf_d0_m; \
  } while (0)
630
/*
 * =============================================================================
 * Description : Butterfly of 8 input vectors (byte elements)
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7
 * Details     : Butterfly operation. All eight results are computed into
 *               temporaries before any output is written, so an output may
 *               name the same variable as an input (in-place use).
 *               Wrapped in do { } while (0) so the invocation plus its
 *               trailing semicolon is a single statement.
 * Example     :
 *              _out0 = _in0 + _in7;
 *              _out1 = _in1 + _in6;
 *              _out2 = _in2 + _in5;
 *              _out3 = _in3 + _in4;
 *              _out4 = _in3 - _in4;
 *              _out5 = _in2 - _in5;
 *              _out6 = _in1 - _in6;
 *              _out7 = _in0 - _in7;
 * =============================================================================
 */
#define LSX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                          _out7) \
  do { \
    const __m128i _bf_s0_m = __lsx_vadd_b(_in0, _in7); \
    const __m128i _bf_s1_m = __lsx_vadd_b(_in1, _in6); \
    const __m128i _bf_s2_m = __lsx_vadd_b(_in2, _in5); \
    const __m128i _bf_s3_m = __lsx_vadd_b(_in3, _in4); \
    const __m128i _bf_d3_m = __lsx_vsub_b(_in3, _in4); \
    const __m128i _bf_d2_m = __lsx_vsub_b(_in2, _in5); \
    const __m128i _bf_d1_m = __lsx_vsub_b(_in1, _in6); \
    const __m128i _bf_d0_m = __lsx_vsub_b(_in0, _in7); \
    _out0 = _bf_s0_m; \
    _out1 = _bf_s1_m; \
    _out2 = _bf_s2_m; \
    _out3 = _bf_s3_m; \
    _out4 = _bf_d3_m; \
    _out5 = _bf_d2_m; \
    _out6 = _bf_d1_m; \
    _out7 = _bf_d0_m; \
  } while (0)
661
/*
 * Butterfly of 8 input vectors (half-word elements):
 *   _out0 = _in0 + _in7;  _out1 = _in1 + _in6;
 *   _out2 = _in2 + _in5;  _out3 = _in3 + _in4;
 *   _out4 = _in3 - _in4;  _out5 = _in2 - _in5;
 *   _out6 = _in1 - _in6;  _out7 = _in0 - _in7;
 * Results are computed into temporaries before any output is written, so an
 * output may name the same variable as an input. Wrapped in do { } while (0)
 * so the invocation plus its trailing semicolon is a single statement.
 */
#define LSX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                          _out7) \
  do { \
    const __m128i _bf_s0_m = __lsx_vadd_h(_in0, _in7); \
    const __m128i _bf_s1_m = __lsx_vadd_h(_in1, _in6); \
    const __m128i _bf_s2_m = __lsx_vadd_h(_in2, _in5); \
    const __m128i _bf_s3_m = __lsx_vadd_h(_in3, _in4); \
    const __m128i _bf_d3_m = __lsx_vsub_h(_in3, _in4); \
    const __m128i _bf_d2_m = __lsx_vsub_h(_in2, _in5); \
    const __m128i _bf_d1_m = __lsx_vsub_h(_in1, _in6); \
    const __m128i _bf_d0_m = __lsx_vsub_h(_in0, _in7); \
    _out0 = _bf_s0_m; \
    _out1 = _bf_s1_m; \
    _out2 = _bf_s2_m; \
    _out3 = _bf_s3_m; \
    _out4 = _bf_d3_m; \
    _out5 = _bf_d2_m; \
    _out6 = _bf_d1_m; \
    _out7 = _bf_d0_m; \
  } while (0)
675
/*
 * Butterfly of 8 input vectors (word elements):
 *   _out0 = _in0 + _in7;  _out1 = _in1 + _in6;
 *   _out2 = _in2 + _in5;  _out3 = _in3 + _in4;
 *   _out4 = _in3 - _in4;  _out5 = _in2 - _in5;
 *   _out6 = _in1 - _in6;  _out7 = _in0 - _in7;
 * Results are computed into temporaries before any output is written, so an
 * output may name the same variable as an input. Wrapped in do { } while (0)
 * so the invocation plus its trailing semicolon is a single statement.
 */
#define LSX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                          _out7) \
  do { \
    const __m128i _bf_s0_m = __lsx_vadd_w(_in0, _in7); \
    const __m128i _bf_s1_m = __lsx_vadd_w(_in1, _in6); \
    const __m128i _bf_s2_m = __lsx_vadd_w(_in2, _in5); \
    const __m128i _bf_s3_m = __lsx_vadd_w(_in3, _in4); \
    const __m128i _bf_d3_m = __lsx_vsub_w(_in3, _in4); \
    const __m128i _bf_d2_m = __lsx_vsub_w(_in2, _in5); \
    const __m128i _bf_d1_m = __lsx_vsub_w(_in1, _in6); \
    const __m128i _bf_d0_m = __lsx_vsub_w(_in0, _in7); \
    _out0 = _bf_s0_m; \
    _out1 = _bf_s1_m; \
    _out2 = _bf_s2_m; \
    _out3 = _bf_s3_m; \
    _out4 = _bf_d3_m; \
    _out5 = _bf_d2_m; \
    _out6 = _bf_d1_m; \
    _out7 = _bf_d0_m; \
  } while (0)
689
/*
 * Butterfly of 8 input vectors (double-word elements):
 *   _out0 = _in0 + _in7;  _out1 = _in1 + _in6;
 *   _out2 = _in2 + _in5;  _out3 = _in3 + _in4;
 *   _out4 = _in3 - _in4;  _out5 = _in2 - _in5;
 *   _out6 = _in1 - _in6;  _out7 = _in0 - _in7;
 * Results are computed into temporaries before any output is written, so an
 * output may name the same variable as an input. Wrapped in do { } while (0)
 * so the invocation plus its trailing semicolon is a single statement.
 */
#define LSX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                          _out7) \
  do { \
    const __m128i _bf_s0_m = __lsx_vadd_d(_in0, _in7); \
    const __m128i _bf_s1_m = __lsx_vadd_d(_in1, _in6); \
    const __m128i _bf_s2_m = __lsx_vadd_d(_in2, _in5); \
    const __m128i _bf_s3_m = __lsx_vadd_d(_in3, _in4); \
    const __m128i _bf_d3_m = __lsx_vsub_d(_in3, _in4); \
    const __m128i _bf_d2_m = __lsx_vsub_d(_in2, _in5); \
    const __m128i _bf_d1_m = __lsx_vsub_d(_in1, _in6); \
    const __m128i _bf_d0_m = __lsx_vsub_d(_in0, _in7); \
    _out0 = _bf_s0_m; \
    _out1 = _bf_s1_m; \
    _out2 = _bf_s2_m; \
    _out3 = _bf_s3_m; \
    _out4 = _bf_d3_m; \
    _out5 = _bf_d2_m; \
    _out6 = _bf_d1_m; \
    _out7 = _bf_d0_m; \
  } while (0)
703
704 #endif // LSX
705
706 #ifdef __loongarch_asx
707 #include <lasxintrin.h>
708 /*
709 * =============================================================================
710 * Description : Dot product of byte vector elements
711 * Arguments : Inputs - in_h, in_l
712 * Output - out
713 * Return Type - signed halfword
714 * Details : Unsigned byte elements from in_h are multiplied with
715 * unsigned byte elements from in_l producing a result
716 * twice the size of input i.e. signed halfword.
717 * Then this multiplied results of adjacent odd-even elements
718 * are added to the out vector
719 * Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
720 * =============================================================================
721 */
__lasx_xvdp2_h_bu(__m256i in_h,__m256i in_l)722 static inline __m256i __lasx_xvdp2_h_bu(__m256i in_h, __m256i in_l) {
723 __m256i out;
724
725 out = __lasx_xvmulwev_h_bu(in_h, in_l);
726 out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
727 return out;
728 }
729
730 /*
731 * =============================================================================
732 * Description : Dot product of byte vector elements
733 * Arguments : Inputs - in_h, in_l
734 * Output - out
735 * Return Type - signed halfword
736 * Details : Signed byte elements from in_h are multiplied with
737 * signed byte elements from in_l producing a result
738 * twice the size of input i.e. signed halfword.
739 * Then this multiplication results of adjacent odd-even elements
740 * are added to the out vector
741 * Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
742 * =============================================================================
743 */
__lasx_xvdp2_h_b(__m256i in_h,__m256i in_l)744 static inline __m256i __lasx_xvdp2_h_b(__m256i in_h, __m256i in_l) {
745 __m256i out;
746
747 out = __lasx_xvmulwev_h_b(in_h, in_l);
748 out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
749 return out;
750 }
751
752 /*
753 * =============================================================================
754 * Description : Dot product of halfword vector elements
755 * Arguments : Inputs - in_h, in_l
756 * Output - out
757 * Return Type - signed word
758 * Details : Signed halfword elements from in_h are multiplied with
759 * signed halfword elements from in_l producing a result
760 * twice the size of input i.e. signed word.
761 * Then this multiplied results of adjacent odd-even elements
762 * are added to the out vector.
763 * Example : out = __lasx_xvdp2_w_h(in_h, in_l)
764 * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
765 * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
766 * out : 22,38,38,22, 22,38,38,22
767 * =============================================================================
768 */
__lasx_xvdp2_w_h(__m256i in_h,__m256i in_l)769 static inline __m256i __lasx_xvdp2_w_h(__m256i in_h, __m256i in_l) {
770 __m256i out;
771
772 out = __lasx_xvmulwev_w_h(in_h, in_l);
773 out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
774 return out;
775 }
776
777 /*
778 * =============================================================================
779 * Description : Dot product of word vector elements
780 * Arguments : Inputs - in_h, in_l
781 * Output - out
782 * Return Type - signed double
783 * Details : Signed word elements from in_h are multiplied with
784 * signed word elements from in_l producing a result
785 * twice the size of input i.e. signed double-word.
786 * Then this multiplied results of adjacent odd-even elements
787 * are added to the out vector.
788 * Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
789 * =============================================================================
790 */
__lasx_xvdp2_d_w(__m256i in_h,__m256i in_l)791 static inline __m256i __lasx_xvdp2_d_w(__m256i in_h, __m256i in_l) {
792 __m256i out;
793
794 out = __lasx_xvmulwev_d_w(in_h, in_l);
795 out = __lasx_xvmaddwod_d_w(out, in_h, in_l);
796 return out;
797 }
798
799 /*
800 * =============================================================================
801 * Description : Dot product of halfword vector elements
802 * Arguments : Inputs - in_h, in_l
803 * Output - out
804 * Return Type - signed word
805 * Details : Unsigned halfword elements from in_h are multiplied with
806 * signed halfword elements from in_l producing a result
807 * twice the size of input i.e. unsigned word.
808 * Multiplication result of adjacent odd-even elements
809 * are added to the out vector
810 * Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
811 * =============================================================================
812 */
__lasx_xvdp2_w_hu_h(__m256i in_h,__m256i in_l)813 static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l) {
814 __m256i out;
815
816 out = __lasx_xvmulwev_w_hu_h(in_h, in_l);
817 out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
818 return out;
819 }
820
821 /*
822 * =============================================================================
823 * Description : Dot product & addition of byte vector elements
824 * Arguments : Inputs - in_h, in_l
825 * Output - out
826 * Return Type - halfword
827 * Details : Signed byte elements from in_h are multiplied with
828 * signed byte elements from in_l producing a result
829 * twice the size of input i.e. signed halfword.
830 * Then this multiplied results of adjacent odd-even elements
831 * are added to the in_c vector.
832 * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
833 * =============================================================================
834 */
__lasx_xvdp2add_h_b(__m256i in_c,__m256i in_h,__m256i in_l)835 static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c,
836 __m256i in_h,
837 __m256i in_l) {
838 __m256i out;
839
840 out = __lasx_xvmaddwev_h_b(in_c, in_h, in_l);
841 out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
842 return out;
843 }
844
845 /*
846 * =============================================================================
847 * Description : Dot product & addition of byte vector elements
848 * Arguments : Inputs - in_h, in_l
849 * Output - out
850 * Return Type - halfword
851 * Details : Unsigned byte elements from in_h are multiplied with
852 * unsigned byte elements from in_l producing a result
853 * twice the size of input i.e. signed halfword.
854 * Then this multiplied results of adjacent odd-even elements
855 * are added to the in_c vector.
856 * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
857 * =============================================================================
858 */
__lasx_xvdp2add_h_bu(__m256i in_c,__m256i in_h,__m256i in_l)859 static inline __m256i __lasx_xvdp2add_h_bu(__m256i in_c,
860 __m256i in_h,
861 __m256i in_l) {
862 __m256i out;
863
864 out = __lasx_xvmaddwev_h_bu(in_c, in_h, in_l);
865 out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
866 return out;
867 }
868
869 /*
870 * =============================================================================
871 * Description : Dot product & addition of byte vector elements
872 * Arguments : Inputs - in_h, in_l
873 * Output - out
874 * Return Type - halfword
875 * Details : Unsigned byte elements from in_h are multiplied with
876 * signed byte elements from in_l producing a result
877 * twice the size of input i.e. signed halfword.
878 * Then this multiplied results of adjacent odd-even elements
879 * are added to the in_c vector.
880 * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
881 * =============================================================================
882 */
__lasx_xvdp2add_h_bu_b(__m256i in_c,__m256i in_h,__m256i in_l)883 static inline __m256i __lasx_xvdp2add_h_bu_b(__m256i in_c,
884 __m256i in_h,
885 __m256i in_l) {
886 __m256i out;
887
888 out = __lasx_xvmaddwev_h_bu_b(in_c, in_h, in_l);
889 out = __lasx_xvmaddwod_h_bu_b(out, in_h, in_l);
890 return out;
891 }
892
893 /*
894 * =============================================================================
895 * Description : Dot product of halfword vector elements
896 * Arguments : Inputs - in_c, in_h, in_l
897 * Output - out
898 * Return Type - per RTYPE
899 * Details : Signed halfword elements from in_h are multiplied with
900 * signed halfword elements from in_l producing a result
901 * twice the size of input i.e. signed word.
902 * Multiplication result of adjacent odd-even elements
903 * are added to the in_c vector.
904 * Example : out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
905 * in_c : 1,2,3,4, 1,2,3,4
906 * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8,
907 * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1,
908 * out : 23,40,41,26, 23,40,41,26
909 * =============================================================================
910 */
__lasx_xvdp2add_w_h(__m256i in_c,__m256i in_h,__m256i in_l)911 static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c,
912 __m256i in_h,
913 __m256i in_l) {
914 __m256i out;
915
916 out = __lasx_xvmaddwev_w_h(in_c, in_h, in_l);
917 out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
918 return out;
919 }
920
921 /*
922 * =============================================================================
923 * Description : Dot product of halfword vector elements
924 * Arguments : Inputs - in_c, in_h, in_l
925 * Output - out
926 * Return Type - signed word
927 * Details : Unsigned halfword elements from in_h are multiplied with
928 * unsigned halfword elements from in_l producing a result
929 * twice the size of input i.e. signed word.
930 * Multiplication result of adjacent odd-even elements
931 * are added to the in_c vector.
932 * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
933 * =============================================================================
934 */
__lasx_xvdp2add_w_hu(__m256i in_c,__m256i in_h,__m256i in_l)935 static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c,
936 __m256i in_h,
937 __m256i in_l) {
938 __m256i out;
939
940 out = __lasx_xvmaddwev_w_hu(in_c, in_h, in_l);
941 out = __lasx_xvmaddwod_w_hu(out, in_h, in_l);
942 return out;
943 }
944
945 /*
946 * =============================================================================
947 * Description : Dot product of halfword vector elements
948 * Arguments : Inputs - in_c, in_h, in_l
949 * Output - out
950 * Return Type - signed word
951 * Details : Unsigned halfword elements from in_h are multiplied with
952 * signed halfword elements from in_l producing a result
953 * twice the size of input i.e. signed word.
954 * Multiplication result of adjacent odd-even elements
955 * are added to the in_c vector
956 * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
957 * =============================================================================
958 */
__lasx_xvdp2add_w_hu_h(__m256i in_c,__m256i in_h,__m256i in_l)959 static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c,
960 __m256i in_h,
961 __m256i in_l) {
962 __m256i out;
963
964 out = __lasx_xvmaddwev_w_hu_h(in_c, in_h, in_l);
965 out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
966 return out;
967 }
968
969 /*
970 * =============================================================================
971 * Description : Vector Unsigned Dot Product and Subtract
972 * Arguments : Inputs - in_c, in_h, in_l
973 * Output - out
974 * Return Type - signed halfword
975 * Details : Unsigned byte elements from in_h are multiplied with
976 * unsigned byte elements from in_l producing a result
977 * twice the size of input i.e. signed halfword.
978 * Multiplication result of adjacent odd-even elements
979 * are added together and subtracted from double width elements
980 * in_c vector.
981 * Example : See out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l)
982 * =============================================================================
983 */
__lasx_xvdp2sub_h_bu(__m256i in_c,__m256i in_h,__m256i in_l)984 static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c,
985 __m256i in_h,
986 __m256i in_l) {
987 __m256i out;
988
989 out = __lasx_xvmulwev_h_bu(in_h, in_l);
990 out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
991 out = __lasx_xvsub_h(in_c, out);
992 return out;
993 }
994
995 /*
996 * =============================================================================
997 * Description : Vector Signed Dot Product and Subtract
998 * Arguments : Inputs - in_c, in_h, in_l
999 * Output - out
1000 * Return Type - signed word
1001 * Details : Signed halfword elements from in_h are multiplied with
1002 * Signed halfword elements from in_l producing a result
1003 * twice the size of input i.e. signed word.
1004 * Multiplication result of adjacent odd-even elements
1005 * are added together and subtracted from double width elements
1006 * in_c vector.
1007 * Example : out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l)
1008 * in_c : 0,0,0,0, 0,0,0,0
1009 * in_h : 3,1,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1
1010 * in_l : 2,1,1,0, 1,0,0,0, 0,0,1,0, 1,0,0,1
1011 * out : -7,-3,0,0, 0,-1,0,-1
1012 * =============================================================================
1013 */
__lasx_xvdp2sub_w_h(__m256i in_c,__m256i in_h,__m256i in_l)1014 static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c,
1015 __m256i in_h,
1016 __m256i in_l) {
1017 __m256i out;
1018
1019 out = __lasx_xvmulwev_w_h(in_h, in_l);
1020 out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
1021 out = __lasx_xvsub_w(in_c, out);
1022 return out;
1023 }
1024
1025 /*
1026 * =============================================================================
1027 * Description : Dot product of halfword vector elements
1028 * Arguments : Inputs - in_h, in_l
1029 * Output - out
1030 * Return Type - signed word
1031 * Details : Signed halfword elements from in_h are multiplied with
1032 * signed halfword elements from in_l producing a result
1033 * four times the size of input i.e. signed doubleword.
1034 * Then this multiplication results of four adjacent elements
1035 * are added together and stored to the out vector.
1036 * Example : out = __lasx_xvdp4_d_h(in_h, in_l)
1037 * in_h : 3,1,3,0, 0,0,0,1, 0,0,1,-1, 0,0,0,1
1038 * in_l : -2,1,1,0, 1,0,0,0, 0,0,1, 0, 1,0,0,1
1039 * out : -2,0,1,1
1040 * =============================================================================
1041 */
__lasx_xvdp4_d_h(__m256i in_h,__m256i in_l)1042 static inline __m256i __lasx_xvdp4_d_h(__m256i in_h, __m256i in_l) {
1043 __m256i out;
1044
1045 out = __lasx_xvmulwev_w_h(in_h, in_l);
1046 out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
1047 out = __lasx_xvhaddw_d_w(out, out);
1048 return out;
1049 }
1050
1051 /*
1052 * =============================================================================
1053 * Description : The high half of the vector elements are expanded and
1054 * added after being doubled.
1055 * Arguments : Inputs - in_h, in_l
1056 * Output - out
1057 * Details : The in_h vector and the in_l vector are added after the
1058 * higher half of the two-fold sign extension (signed byte
1059 * to signed halfword) and stored to the out vector.
1060 * Example : See out = __lasx_xvaddwh_w_h(in_h, in_l)
1061 * =============================================================================
1062 */
__lasx_xvaddwh_h_b(__m256i in_h,__m256i in_l)1063 static inline __m256i __lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l) {
1064 __m256i out;
1065
1066 out = __lasx_xvilvh_b(in_h, in_l);
1067 out = __lasx_xvhaddw_h_b(out, out);
1068 return out;
1069 }
1070
1071 /*
1072 * =============================================================================
1073 * Description : The high half of the vector elements are expanded and
1074 * added after being doubled.
1075 * Arguments : Inputs - in_h, in_l
1076 * Output - out
1077 * Details : The in_h vector and the in_l vector are added after the
1078 * higher half of the two-fold sign extension (signed halfword
1079 * to signed word) and stored to the out vector.
1080 * Example : out = __lasx_xvaddwh_w_h(in_h, in_l)
1081 * in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1082 * in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1
1083 * out : 1,0,0,-1, 1,0,0, 2
1084 * =============================================================================
1085 */
__lasx_xvaddwh_w_h(__m256i in_h,__m256i in_l)1086 static inline __m256i __lasx_xvaddwh_w_h(__m256i in_h, __m256i in_l) {
1087 __m256i out;
1088
1089 out = __lasx_xvilvh_h(in_h, in_l);
1090 out = __lasx_xvhaddw_w_h(out, out);
1091 return out;
1092 }
1093
1094 /*
1095 * =============================================================================
1096 * Description : The low half of the vector elements are expanded and
1097 * added after being doubled.
1098 * Arguments : Inputs - in_h, in_l
1099 * Output - out
1100 * Details : The in_h vector and the in_l vector are added after the
1101 * lower half of the two-fold sign extension (signed byte
1102 * to signed halfword) and stored to the out vector.
1103 * Example : See out = __lasx_xvaddwl_w_h(in_h, in_l)
1104 * =============================================================================
1105 */
__lasx_xvaddwl_h_b(__m256i in_h,__m256i in_l)1106 static inline __m256i __lasx_xvaddwl_h_b(__m256i in_h, __m256i in_l) {
1107 __m256i out;
1108
1109 out = __lasx_xvilvl_b(in_h, in_l);
1110 out = __lasx_xvhaddw_h_b(out, out);
1111 return out;
1112 }
1113
1114 /*
1115 * =============================================================================
1116 * Description : The low half of the vector elements are expanded and
1117 * added after being doubled.
1118 * Arguments : Inputs - in_h, in_l
1119 * Output - out
1120 * Details : The in_h vector and the in_l vector are added after the
1121 * lower half of the two-fold sign extension (signed halfword
1122 * to signed word) and stored to the out vector.
1123 * Example : out = __lasx_xvaddwl_w_h(in_h, in_l)
1124 * in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1125 * in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1
1126 * out : 5,-1,4,2, 1,0,2,-1
1127 * =============================================================================
1128 */
__lasx_xvaddwl_w_h(__m256i in_h,__m256i in_l)1129 static inline __m256i __lasx_xvaddwl_w_h(__m256i in_h, __m256i in_l) {
1130 __m256i out;
1131
1132 out = __lasx_xvilvl_h(in_h, in_l);
1133 out = __lasx_xvhaddw_w_h(out, out);
1134 return out;
1135 }
1136
1137 /*
1138 * =============================================================================
1139 * Description : The low half of the vector elements are expanded and
1140 * added after being doubled.
1141 * Arguments : Inputs - in_h, in_l
1142 * Output - out
1143 * Details : The out vector and the out vector are added after the
1144 * lower half of the two-fold zero extension (unsigned byte
1145 * to unsigned halfword) and stored to the out vector.
1146 * Example : See out = __lasx_xvaddwl_w_h(in_h, in_l)
1147 * =============================================================================
1148 */
__lasx_xvaddwl_h_bu(__m256i in_h,__m256i in_l)1149 static inline __m256i __lasx_xvaddwl_h_bu(__m256i in_h, __m256i in_l) {
1150 __m256i out;
1151
1152 out = __lasx_xvilvl_b(in_h, in_l);
1153 out = __lasx_xvhaddw_hu_bu(out, out);
1154 return out;
1155 }
1156
1157 /*
1158 * =============================================================================
1159 * Description : The low half of the vector elements are expanded and
1160 * added after being doubled.
1161 * Arguments : Inputs - in_h, in_l
1162 * Output - out
1163 * Details : The in_l vector after double zero extension (unsigned byte to
1164 * signed halfword),added to the in_h vector.
1165 * Example : See out = __lasx_xvaddw_w_w_h(in_h, in_l)
1166 * =============================================================================
1167 */
__lasx_xvaddw_h_h_bu(__m256i in_h,__m256i in_l)1168 static inline __m256i __lasx_xvaddw_h_h_bu(__m256i in_h, __m256i in_l) {
1169 __m256i out;
1170
1171 out = __lasx_xvsllwil_hu_bu(in_l, 0);
1172 out = __lasx_xvadd_h(in_h, out);
1173 return out;
1174 }
1175
1176 /*
1177 * =============================================================================
1178 * Description : The low half of the vector elements are expanded and
1179 * added after being doubled.
1180 * Arguments : Inputs - in_h, in_l
1181 * Output - out
1182 * Details : The in_l vector after double sign extension (signed halfword to
1183 * signed word), added to the in_h vector.
1184 * Example : out = __lasx_xvaddw_w_w_h(in_h, in_l)
1185 * in_h : 0, 1,0,0, -1,0,0,1,
1186 * in_l : 2,-1,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1,
1187 * out : 2, 0,1,2, -1,0,1,1,
1188 * =============================================================================
1189 */
__lasx_xvaddw_w_w_h(__m256i in_h,__m256i in_l)1190 static inline __m256i __lasx_xvaddw_w_w_h(__m256i in_h, __m256i in_l) {
1191 __m256i out;
1192
1193 out = __lasx_xvsllwil_w_h(in_l, 0);
1194 out = __lasx_xvadd_w(in_h, out);
1195 return out;
1196 }
1197
1198 /*
1199 * =============================================================================
1200 * Description : Multiplication and addition calculation after expansion
1201 * of the lower half of the vector.
1202 * Arguments : Inputs - in_c, in_h, in_l
1203 * Output - out
1204 * Details : The in_h vector and the in_l vector are multiplied after
1205 * the lower half of the two-fold sign extension (signed halfword
1206 * to signed word), and the result is added to the vector in_c,
1207 * then stored to the out vector.
1208 * Example : out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l)
1209 * in_c : 1,2,3,4, 5,6,7,8
1210 * in_h : 1,2,3,4, 1,2,3,4, 5,6,7,8, 5,6,7,8
1211 * in_l : 200, 300, 400, 500, 2000, 3000, 4000, 5000,
1212 * -200,-300,-400,-500, -2000,-3000,-4000,-5000
1213 * out : 201, 602,1203,2004, -995, -1794,-2793,-3992
1214 * =============================================================================
1215 */
__lasx_xvmaddwl_w_h(__m256i in_c,__m256i in_h,__m256i in_l)1216 static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c,
1217 __m256i in_h,
1218 __m256i in_l) {
1219 __m256i tmp0, tmp1, out;
1220
1221 tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
1222 tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
1223 tmp0 = __lasx_xvmul_w(tmp0, tmp1);
1224 out = __lasx_xvadd_w(tmp0, in_c);
1225 return out;
1226 }
1227
1228 /*
1229 * =============================================================================
1230 * Description : Multiplication and addition calculation after expansion
1231 * of the higher half of the vector.
1232 * Arguments : Inputs - in_c, in_h, in_l
1233 * Output - out
1234 * Details : The in_h vector and the in_l vector are multiplied after
1235 * the higher half of the two-fold sign extension (signed
1236 * halfword to signed word), and the result is added to
1237 * the vector in_c, then stored to the out vector.
1238 * Example : See out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l)
1239 * =============================================================================
1240 */
__lasx_xvmaddwh_w_h(__m256i in_c,__m256i in_h,__m256i in_l)1241 static inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c,
1242 __m256i in_h,
1243 __m256i in_l) {
1244 __m256i tmp0, tmp1, out;
1245
1246 tmp0 = __lasx_xvilvh_h(in_h, in_h);
1247 tmp1 = __lasx_xvilvh_h(in_l, in_l);
1248 tmp0 = __lasx_xvmulwev_w_h(tmp0, tmp1);
1249 out = __lasx_xvadd_w(tmp0, in_c);
1250 return out;
1251 }
1252
1253 /*
1254 * =============================================================================
1255 * Description : Multiplication calculation after expansion of the lower
1256 * half of the vector.
1257 * Arguments : Inputs - in_h, in_l
1258 * Output - out
1259 * Details : The in_h vector and the in_l vector are multiplied after
1260 * the lower half of the two-fold sign extension (signed
1261 * halfword to signed word), then stored to the out vector.
1262 * Example : out = __lasx_xvmulwl_w_h(in_h, in_l)
1263 * in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1264 * in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1
1265 * out : 6,1,3,0, 0,0,1,0
1266 * =============================================================================
1267 */
__lasx_xvmulwl_w_h(__m256i in_h,__m256i in_l)1268 static inline __m256i __lasx_xvmulwl_w_h(__m256i in_h, __m256i in_l) {
1269 __m256i tmp0, tmp1, out;
1270
1271 tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
1272 tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
1273 out = __lasx_xvmul_w(tmp0, tmp1);
1274 return out;
1275 }
1276
1277 /*
1278 * =============================================================================
1279 * Description : Multiplication calculation after expansion of the lower
1280 * half of the vector.
1281 * Arguments : Inputs - in_h, in_l
1282 * Output - out
1283 * Details : The in_h vector and the in_l vector are multiplied after
1284 * the lower half of the two-fold sign extension (signed
1285 * halfword to signed word), then stored to the out vector.
1286 * Example : out = __lasx_xvmulwh_w_h(in_h, in_l)
1287 * in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1288 * in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1
1289 * out : 0,0,0,0, 0,0,0,1
1290 * =============================================================================
1291 */
__lasx_xvmulwh_w_h(__m256i in_h,__m256i in_l)1292 static inline __m256i __lasx_xvmulwh_w_h(__m256i in_h, __m256i in_l) {
1293 __m256i tmp0, tmp1, out;
1294
1295 tmp0 = __lasx_xvilvh_h(in_h, in_h);
1296 tmp1 = __lasx_xvilvh_h(in_l, in_l);
1297 out = __lasx_xvmulwev_w_h(tmp0, tmp1);
1298 return out;
1299 }
1300
1301 /*
1302 * =============================================================================
1303 * Description : The low half of the vector elements are added to the high half
1304 * after being doubled, then saturated.
1305 * Arguments : Inputs - in_h, in_l
1306 * Output - out
1307 * Details : The in_h vector adds the in_l vector after the lower half of
1308 * the two-fold zero extension (unsigned byte to unsigned
1309 * halfword) and then saturated. The results are stored to the out
1310 * vector.
1311 * Example : out = __lasx_xvsaddw_hu_hu_bu(in_h, in_l)
1312 * in_h : 2,65532,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1
1313 * in_l : 3,6,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1, 3,18,3,0, 0,0,0,1, 0,0,1,1,
1314 * 0,0,0,1
1315 * out : 5,65535,4,2, 1,0,0,1, 3,18,4,0, 1,0,0,2,
1316 * =============================================================================
1317 */
__lasx_xvsaddw_hu_hu_bu(__m256i in_h,__m256i in_l)1318 static inline __m256i __lasx_xvsaddw_hu_hu_bu(__m256i in_h, __m256i in_l) {
1319 __m256i tmp1, out;
1320 __m256i zero = {0};
1321
1322 tmp1 = __lasx_xvilvl_b(zero, in_l);
1323 out = __lasx_xvsadd_hu(in_h, tmp1);
1324 return out;
1325 }
1326
1327 /*
1328 * =============================================================================
1329 * Description : Clip all halfword elements of input vector between min & max
1330 * out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in))
1331 * Arguments : Inputs - in (input vector)
1332 * - min (min threshold)
1333 * - max (max threshold)
1334 * Outputs - in (output vector with clipped elements)
1335 * Return Type - signed halfword
1336 * Example : out = __lasx_xvclip_h(in, min, max)
1337 * in : -8,2,280,249, -8,255,280,249, 4,4,4,4, 5,5,5,5
1338 * min : 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1
1339 * max : 9,9,9,9, 9,9,9,9, 9,9,9,9, 9,9,9,9
1340 * out : 1,2,9,9, 1,9,9,9, 4,4,4,4, 5,5,5,5
1341 * =============================================================================
1342 */
__lasx_xvclip_h(__m256i in,__m256i min,__m256i max)1343 static inline __m256i __lasx_xvclip_h(__m256i in, __m256i min, __m256i max) {
1344 __m256i out;
1345
1346 out = __lasx_xvmax_h(min, in);
1347 out = __lasx_xvmin_h(max, out);
1348 return out;
1349 }
1350
1351 /*
1352 * =============================================================================
1353 * Description : Clip all signed halfword elements of input vector
1354 * between 0 & 255
1355 * Arguments : Inputs - in (input vector)
1356 * Outputs - out (output vector with clipped elements)
1357 * Return Type - signed halfword
1358 * Example : See out = __lasx_xvclip255_w(in)
1359 * =============================================================================
1360 */
__lasx_xvclip255_h(__m256i in)1361 static inline __m256i __lasx_xvclip255_h(__m256i in) {
1362 __m256i out;
1363
1364 out = __lasx_xvmaxi_h(in, 0);
1365 out = __lasx_xvsat_hu(out, 7);
1366 return out;
1367 }
1368
1369 /*
1370 * =============================================================================
1371 * Description : Clip all signed word elements of input vector
1372 * between 0 & 255
1373 * Arguments : Inputs - in (input vector)
1374 * Output - out (output vector with clipped elements)
1375 * Return Type - signed word
1376 * Example : out = __lasx_xvclip255_w(in)
1377 * in : -8,255,280,249, -8,255,280,249
1378 * out : 0,255,255,249, 0,255,255,249
1379 * =============================================================================
1380 */
__lasx_xvclip255_w(__m256i in)1381 static inline __m256i __lasx_xvclip255_w(__m256i in) {
1382 __m256i out;
1383
1384 out = __lasx_xvmaxi_w(in, 0);
1385 out = __lasx_xvsat_wu(out, 7);
1386 return out;
1387 }
1388
1389 /*
1390 * =============================================================================
1391 * Description : Indexed halfword element values are replicated to all
1392 * elements in output vector. If 'idx < 8' use xvsplati_l_*,
1393 * if 'idx >= 8' use xvsplati_h_*.
1394 * Arguments : Inputs - in, idx
1395 * Output - out
1396 * Details : Idx element value from in vector is replicated to all
1397 * elements in out vector.
1398 * Valid index range for halfword operation is 0-7
1399 * Example : out = __lasx_xvsplati_l_h(in, idx)
1400 * in : 20,10,11,12, 13,14,15,16, 0,0,2,0, 0,0,0,0
1401 * idx : 0x02
1402 * out : 11,11,11,11, 11,11,11,11, 11,11,11,11, 11,11,11,11
1403 * =============================================================================
1404 */
__lasx_xvsplati_l_h(__m256i in,int idx)1405 static inline __m256i __lasx_xvsplati_l_h(__m256i in, int idx) {
1406 __m256i out;
1407
1408 out = __lasx_xvpermi_q(in, in, 0x02);
1409 out = __lasx_xvreplve_h(out, idx);
1410 return out;
1411 }
1412
1413 /*
1414 * =============================================================================
1415 * Description : Indexed halfword element values are replicated to all
1416 * elements in output vector. If 'idx < 8' use xvsplati_l_*,
1417 * if 'idx >= 8' use xvsplati_h_*.
1418 * Arguments : Inputs - in, idx
1419 * Output - out
1420 * Details : Idx element value from in vector is replicated to all
1421 * elements in out vector.
1422 * Valid index range for halfword operation is 0-7
1423 * Example : out = __lasx_xvsplati_h_h(in, idx)
1424 * in : 20,10,11,12, 13,14,15,16, 0,2,0,0, 0,0,0,0
1425 * idx : 0x09
1426 * out : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2
1427 * =============================================================================
1428 */
__lasx_xvsplati_h_h(__m256i in,int idx)1429 static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx) {
1430 __m256i out;
1431
1432 out = __lasx_xvpermi_q(in, in, 0x13);
1433 out = __lasx_xvreplve_h(out, idx);
1434 return out;
1435 }
1436
/*
 * =============================================================================
 * Description : Transpose 4x4 block with double-word elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3
 *               Outputs - _out0, _out1, _out2, _out3
 * Details     : Rows 0/1 and rows 2/3 are interleaved at double-word
 *               granularity, then the 128-bit halves of those intermediates
 *               are recombined into the output rows. Every input is read
 *               into a temporary before any output is written.
 * Example     : LASX_TRANSPOSE4x4_D
 *        _in0 : 1,2,3,4
 *        _in1 : 1,2,3,4
 *        _in2 : 1,2,3,4
 *        _in3 : 1,2,3,4
 *
 *       _out0 : 1,1,1,1
 *       _out1 : 2,2,2,2
 *       _out2 : 3,3,3,3
 *       _out3 : 4,4,4,4
 * =============================================================================
 */
#define LASX_TRANSPOSE4x4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \
                            _out3)                                       \
  {                                                                      \
    __m256i _tmp0 = __lasx_xvilvl_d(_in1, _in0);                         \
    __m256i _tmp1 = __lasx_xvilvh_d(_in1, _in0);                         \
    __m256i _tmp2 = __lasx_xvilvl_d(_in3, _in2);                         \
    __m256i _tmp3 = __lasx_xvilvh_d(_in3, _in2);                         \
                                                                         \
    _out0 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x20);                        \
    _out1 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x20);                        \
    _out2 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x31);                        \
    _out3 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x31);                        \
  }
1467
/*
 * =============================================================================
 * Description : Transpose 8x8 block with word elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7
 * Details     : Rows 0-3 and rows 4-7 are each reduced by two rounds of
 *               word interleaves into four intermediates; the 128-bit
 *               halves of matching intermediates are then recombined into
 *               the output rows. Every input is read into a temporary
 *               before any output is written.
 * Example     : LASX_TRANSPOSE8x8_W
 *        _in0 : 1,2,3,4,5,6,7,8
 *        _in1 : 2,2,3,4,5,6,7,8
 *        _in2 : 3,2,3,4,5,6,7,8
 *        _in3 : 4,2,3,4,5,6,7,8
 *        _in4 : 5,2,3,4,5,6,7,8
 *        _in5 : 6,2,3,4,5,6,7,8
 *        _in6 : 7,2,3,4,5,6,7,8
 *        _in7 : 8,2,3,4,5,6,7,8
 *
 *       _out0 : 1,2,3,4,5,6,7,8
 *       _out1 : 2,2,2,2,2,2,2,2
 *       _out2 : 3,3,3,3,3,3,3,3
 *       _out3 : 4,4,4,4,4,4,4,4
 *       _out4 : 5,5,5,5,5,5,5,5
 *       _out5 : 6,6,6,6,6,6,6,6
 *       _out6 : 7,7,7,7,7,7,7,7
 *       _out7 : 8,8,8,8,8,8,8,8
 * =============================================================================
 */
#define LASX_TRANSPOSE8x8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                            _out0, _out1, _out2, _out3, _out4, _out5,       \
                            _out6, _out7)                                   \
  {                                                                         \
    __m256i _s0_m, _s1_m;                                                   \
    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m, _tmp4_m, _tmp5_m, _tmp6_m;  \
    __m256i _tmp7_m;                                                        \
                                                                            \
    /* Rows 0-3, low-half words. */                                         \
    _s0_m = __lasx_xvilvl_w(_in2, _in0);                                    \
    _s1_m = __lasx_xvilvl_w(_in3, _in1);                                    \
    _tmp0_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                \
    _tmp1_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                \
    /* Rows 0-3, high-half words. */                                        \
    _s0_m = __lasx_xvilvh_w(_in2, _in0);                                    \
    _s1_m = __lasx_xvilvh_w(_in3, _in1);                                    \
    _tmp2_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                \
    _tmp3_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                \
    /* Rows 4-7, low-half words. */                                         \
    _s0_m = __lasx_xvilvl_w(_in6, _in4);                                    \
    _s1_m = __lasx_xvilvl_w(_in7, _in5);                                    \
    _tmp4_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                \
    _tmp5_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                \
    /* Rows 4-7, high-half words. */                                        \
    _s0_m = __lasx_xvilvh_w(_in6, _in4);                                    \
    _s1_m = __lasx_xvilvh_w(_in7, _in5);                                    \
    _tmp6_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                \
    _tmp7_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                \
    /* Recombine 128-bit halves: 0x20 for rows 0-3, 0x31 for rows 4-7. */   \
    _out0 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x20);                       \
    _out4 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x31);                       \
    _out1 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x20);                       \
    _out5 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x31);                       \
    _out2 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x20);                       \
    _out6 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x31);                       \
    _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x20);                       \
    _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x31);                       \
  }
1527
/*
 * =============================================================================
 * Description : Transpose input 16x8 byte block
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
 *                         _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
 *                         (input 16x8 byte block)
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7 (output 8x16 byte block)
 * Details     : The rows of the matrix become columns, and the columns become
 *               rows. The inputs are only ever read through xvilvl_b, i.e.
 *               only their low-half bytes contribute. The outputs double as
 *               scratch storage between the byte, word and double-word
 *               interleave stages; every input is read before the first
 *               output is written.
 * Example     : See LASX_TRANSPOSE16x8_H
 * =============================================================================
 */
#define LASX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                             _in8, _in9, _in10, _in11, _in12, _in13, _in14,  \
                             _in15, _out0, _out1, _out2, _out3, _out4,       \
                             _out5, _out6, _out7)                            \
  {                                                                          \
    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m, _tmp4_m, _tmp5_m, _tmp6_m;   \
    __m256i _tmp7_m;                                                         \
                                                                             \
    /* Stage 1: byte interleave of rows two apart (consumes all inputs). */  \
    _tmp0_m = __lasx_xvilvl_b(_in2, _in0);                                   \
    _tmp1_m = __lasx_xvilvl_b(_in3, _in1);                                   \
    _tmp2_m = __lasx_xvilvl_b(_in6, _in4);                                   \
    _tmp3_m = __lasx_xvilvl_b(_in7, _in5);                                   \
    _tmp4_m = __lasx_xvilvl_b(_in10, _in8);                                  \
    _tmp5_m = __lasx_xvilvl_b(_in11, _in9);                                  \
    _tmp6_m = __lasx_xvilvl_b(_in14, _in12);                                 \
    _tmp7_m = __lasx_xvilvl_b(_in15, _in13);                                 \
    /* Stage 2: byte interleave of neighbouring intermediates. */            \
    _out0 = __lasx_xvilvl_b(_tmp1_m, _tmp0_m);                               \
    _out1 = __lasx_xvilvh_b(_tmp1_m, _tmp0_m);                               \
    _out2 = __lasx_xvilvl_b(_tmp3_m, _tmp2_m);                               \
    _out3 = __lasx_xvilvh_b(_tmp3_m, _tmp2_m);                               \
    _out4 = __lasx_xvilvl_b(_tmp5_m, _tmp4_m);                               \
    _out5 = __lasx_xvilvh_b(_tmp5_m, _tmp4_m);                               \
    _out6 = __lasx_xvilvl_b(_tmp7_m, _tmp6_m);                               \
    _out7 = __lasx_xvilvh_b(_tmp7_m, _tmp6_m);                               \
    /* Stage 3: word interleave. */                                          \
    _tmp0_m = __lasx_xvilvl_w(_out2, _out0);                                 \
    _tmp1_m = __lasx_xvilvl_w(_out6, _out4);                                 \
    _tmp2_m = __lasx_xvilvh_w(_out2, _out0);                                 \
    _tmp3_m = __lasx_xvilvh_w(_out6, _out4);                                 \
    _tmp4_m = __lasx_xvilvl_w(_out3, _out1);                                 \
    _tmp5_m = __lasx_xvilvl_w(_out7, _out5);                                 \
    _tmp6_m = __lasx_xvilvh_w(_out3, _out1);                                 \
    _tmp7_m = __lasx_xvilvh_w(_out7, _out5);                                 \
    /* Stage 4: double-word interleave produces the final rows. */           \
    _out0 = __lasx_xvilvl_d(_tmp1_m, _tmp0_m);                               \
    _out1 = __lasx_xvilvh_d(_tmp1_m, _tmp0_m);                               \
    _out2 = __lasx_xvilvl_d(_tmp3_m, _tmp2_m);                               \
    _out3 = __lasx_xvilvh_d(_tmp3_m, _tmp2_m);                               \
    _out4 = __lasx_xvilvl_d(_tmp5_m, _tmp4_m);                               \
    _out5 = __lasx_xvilvh_d(_tmp5_m, _tmp4_m);                               \
    _out6 = __lasx_xvilvl_d(_tmp7_m, _tmp6_m);                               \
    _out7 = __lasx_xvilvh_d(_tmp7_m, _tmp6_m);                               \
  }
1582
/*
 * =============================================================================
 * Description : Transpose input 16x8 halfword block
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
 *                         _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
 *                         (input 16x8 halfword block)
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7 (output 8x16 halfword block)
 * Details     : The rows of the matrix become columns, and the columns become
 *               rows. _out0.._out3 are built from the xvilvl_h (low) interleave
 *               of the inputs and _out4.._out7 from the xvilvh_h (high)
 *               interleave. Every input is read before the first output is
 *               stored, so the outputs may be the same variables as the inputs
 *               (in-place use).
 *               NOTE(review): as in the example, only the first 8 halfwords of
 *               each input appear to contribute to the result - confirm
 *               against the LASX xvpermi.q definition.
 * Example     : LASX_TRANSPOSE16x8_H
 *        _in0  : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in1  : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in2  : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in3  : 4,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in4  : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in5  : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in6  : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in7  : 8,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in8  : 9,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in9  : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in10 : 0,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in11 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in12 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in13 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in14 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in15 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *
 *        _out0 : 1,2,3,4,5,6,7,8,9,1,0,2,3,7,5,6
 *        _out1 : 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
 *        _out2 : 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
 *        _out3 : 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4
 *        _out4 : 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
 *        _out5 : 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6
 *        _out6 : 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
 *        _out7 : 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
 * =============================================================================
 */
#define LASX_TRANSPOSE16x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                             _in8, _in9, _in10, _in11, _in12, _in13, _in14, \
                             _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
                             _out6, _out7) \
  { \
    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
    __m256i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
    \
    /* First pass: low-halfword interleave of the row pairs. */ \
    _tmp0_m = __lasx_xvilvl_h(_in2, _in0); \
    _tmp1_m = __lasx_xvilvl_h(_in3, _in1); \
    _tmp2_m = __lasx_xvilvl_h(_in6, _in4); \
    _tmp3_m = __lasx_xvilvl_h(_in7, _in5); \
    _tmp4_m = __lasx_xvilvl_h(_in10, _in8); \
    _tmp5_m = __lasx_xvilvl_h(_in11, _in9); \
    _tmp6_m = __lasx_xvilvl_h(_in14, _in12); \
    _tmp7_m = __lasx_xvilvl_h(_in15, _in13); \
    _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m); \
    _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m); \
    _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m); \
    _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m); \
    _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m); \
    _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m); \
    _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m); \
    _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m); \
    _tmp0_m = __lasx_xvilvl_d(_t2, _t0); \
    _tmp2_m = __lasx_xvilvh_d(_t2, _t0); \
    _tmp4_m = __lasx_xvilvl_d(_t3, _t1); \
    _tmp6_m = __lasx_xvilvh_d(_t3, _t1); \
    _tmp1_m = __lasx_xvilvl_d(_t6, _t4); \
    _tmp3_m = __lasx_xvilvh_d(_t6, _t4); \
    _tmp5_m = __lasx_xvilvl_d(_t7, _t5); \
    _tmp7_m = __lasx_xvilvh_d(_t7, _t5); \
    /* Park the four finished rows in locals: the inputs are still needed */ \
    /* for the second pass and may be the same variables as the outputs.  */ \
    _t0 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20); \
    _t1 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20); \
    _t2 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20); \
    _t3 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20); \
    \
    /* Second pass: high-halfword interleave. This is the last read of */ \
    /* _in0.._in15.                                                    */ \
    _tmp0_m = __lasx_xvilvh_h(_in2, _in0); \
    _tmp1_m = __lasx_xvilvh_h(_in3, _in1); \
    _tmp2_m = __lasx_xvilvh_h(_in6, _in4); \
    _tmp3_m = __lasx_xvilvh_h(_in7, _in5); \
    _tmp4_m = __lasx_xvilvh_h(_in10, _in8); \
    _tmp5_m = __lasx_xvilvh_h(_in11, _in9); \
    _tmp6_m = __lasx_xvilvh_h(_in14, _in12); \
    _tmp7_m = __lasx_xvilvh_h(_in15, _in13); \
    /* The inputs are no longer needed, so the parked rows can be stored. */ \
    _out0 = _t0; \
    _out1 = _t1; \
    _out2 = _t2; \
    _out3 = _t3; \
    _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m); \
    _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m); \
    _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m); \
    _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m); \
    _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m); \
    _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m); \
    _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m); \
    _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m); \
    _tmp0_m = __lasx_xvilvl_d(_t2, _t0); \
    _tmp2_m = __lasx_xvilvh_d(_t2, _t0); \
    _tmp4_m = __lasx_xvilvl_d(_t3, _t1); \
    _tmp6_m = __lasx_xvilvh_d(_t3, _t1); \
    _tmp1_m = __lasx_xvilvl_d(_t6, _t4); \
    _tmp3_m = __lasx_xvilvh_d(_t6, _t4); \
    _tmp5_m = __lasx_xvilvl_d(_t7, _t5); \
    _tmp7_m = __lasx_xvilvh_d(_t7, _t5); \
    _out4 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20); \
    _out5 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20); \
    _out6 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20); \
    _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20); \
  }
1688
/*
 * =============================================================================
 * Description : Transpose 4x4 block with halfword elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3
 *               Outputs - _out0, _out1, _out2, _out3
 *               Return Type - signed halfword
 * Details     : The rows of the matrix become columns, and the columns become
 *               rows. All four inputs are read before any output is written.
 *               _out1 and _out3 are derived from the already stored _out0 and
 *               _out2, so those two outputs must be distinct variables.
 *               Built on the DUP2_ARG2 helper defined at the top of this file.
 * Example     : See LASX_TRANSPOSE8x8_H
 * =============================================================================
 */
#define LASX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \
                            _out3) \
  { \
    __m256i _s0_m, _s1_m; \
    \
    /* Halfword-interleave rows (0,1) and rows (2,3). */ \
    DUP2_ARG2(__lasx_xvilvl_h, _in1, _in0, _in3, _in2, _s0_m, _s1_m); \
    /* The low word interleave feeds _out0/_out1. */ \
    _out0 = __lasx_xvilvl_w(_s1_m, _s0_m); \
    _out1 = __lasx_xvilvh_d(_out0, _out0); \
    /* The high word interleave feeds _out2/_out3. */ \
    _out2 = __lasx_xvilvh_w(_s1_m, _s0_m); \
    _out3 = __lasx_xvilvh_d(_out2, _out2); \
  }
1712
/*
 * =============================================================================
 * Description : Transpose input 8x8 byte block
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *                         (input 8x8 byte block)
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7 (output 8x8 byte block)
 * Details     : Two byte-interleave passes followed by a word interleave
 *               produce the even outputs; each odd output is its even
 *               neighbour shifted right by 8 bytes (xvbsrl_v). All inputs are
 *               read before any output is written.
 *               Built on the DUP2_ARG2 / DUP4_ARG2 helpers defined at the top
 *               of this file.
 * Example     : See LASX_TRANSPOSE8x8_H
 * =============================================================================
 */
#define LASX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                            _out7) \
  { \
    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
    \
    /* Pass 1: byte-interleave rows (0,2) (1,3) (4,6) (5,7). */ \
    DUP4_ARG2(__lasx_xvilvl_b, _in2, _in0, _in3, _in1, _in6, _in4, _in7, \
              _in5, _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m); \
    /* Pass 2: byte-interleave the pass-1 pairs (low and high halves). */ \
    DUP2_ARG2(__lasx_xvilvl_b, _tmp1_m, _tmp0_m, _tmp3_m, _tmp2_m, _tmp4_m, \
              _tmp6_m); \
    DUP2_ARG2(__lasx_xvilvh_b, _tmp1_m, _tmp0_m, _tmp3_m, _tmp2_m, _tmp5_m, \
              _tmp7_m); \
    /* Pass 3: word-interleave into the even outputs. */ \
    DUP2_ARG2(__lasx_xvilvl_w, _tmp6_m, _tmp4_m, _tmp7_m, _tmp5_m, _out0, \
              _out4); \
    DUP2_ARG2(__lasx_xvilvh_w, _tmp6_m, _tmp4_m, _tmp7_m, _tmp5_m, _out2, \
              _out6); \
    /* Odd outputs: the even neighbour shifted right by 8 bytes. */ \
    DUP4_ARG2(__lasx_xvbsrl_v, _out0, 8, _out2, 8, _out4, 8, _out6, 8, _out1, \
              _out3, _out5, _out7); \
  }
1746
/*
 * =============================================================================
 * Description : Transpose 8x8 block with halfword elements in vectors.
 * Arguments   : Inputs  - _in0, _in1, ~
 *               Outputs - _out0, _out1, ~
 * Details     : The rows of the matrix become columns, and the columns become
 *               rows. Rows 4..7 and rows 0..3 are each reduced to four
 *               intermediates by two halfword-interleave passes; the outputs
 *               are then assembled with xvpickev_d (even outputs) and
 *               xvpickod_d (odd outputs). All inputs are read before any
 *               output is written.
 *               Built on the DUP2_ARG2 / DUP4_ARG2 helpers defined at the top
 *               of this file.
 * Example     : LASX_TRANSPOSE8x8_H
 *        _in0 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
 *        _in1 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8
 *        _in2 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8
 *        _in3 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
 *        _in4 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8
 *        _in5 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
 *        _in6 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
 *        _in7 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8
 *
 *        _out0 : 1,8,8,1, 9,1,1,9, 1,8,8,1, 9,1,1,9
 *        _out1 : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2
 *        _out2 : 3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3
 *        _out3 : 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4
 *        _out4 : 5,5,5,5, 5,5,5,5, 5,5,5,5, 5,5,5,5
 *        _out5 : 6,6,6,6, 6,6,6,6, 6,6,6,6, 6,6,6,6
 *        _out6 : 7,7,7,7, 7,7,7,7, 7,7,7,7, 7,7,7,7
 *        _out7 : 8,8,8,8, 8,8,8,8, 8,8,8,8, 8,8,8,8
 * =============================================================================
 */
#define LASX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                            _out7) \
  { \
    __m256i _s0_m, _s1_m; \
    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
    \
    /* Rows 4..7 -> _tmp0_m.._tmp3_m (_s0_m/_s1_m are reused as scratch). */ \
    DUP2_ARG2(__lasx_xvilvl_h, _in6, _in4, _in7, _in5, _s0_m, _s1_m); \
    _tmp0_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
    _tmp1_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
    DUP2_ARG2(__lasx_xvilvh_h, _in6, _in4, _in7, _in5, _s0_m, _s1_m); \
    _tmp2_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
    _tmp3_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
    \
    /* Rows 0..3 -> _tmp4_m.._tmp7_m. */ \
    DUP2_ARG2(__lasx_xvilvl_h, _in2, _in0, _in3, _in1, _s0_m, _s1_m); \
    _tmp4_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
    _tmp5_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
    DUP2_ARG2(__lasx_xvilvh_h, _in2, _in0, _in3, _in1, _s0_m, _s1_m); \
    _tmp6_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
    _tmp7_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
    \
    /* Combine the two halves: pickev_d gives the even outputs, pickod_d */ \
    /* gives the odd outputs.                                            */ \
    DUP4_ARG2(__lasx_xvpickev_d, _tmp0_m, _tmp4_m, _tmp1_m, _tmp5_m, _tmp2_m, \
              _tmp6_m, _tmp3_m, _tmp7_m, _out0, _out2, _out4, _out6); \
    DUP4_ARG2(__lasx_xvpickod_d, _tmp0_m, _tmp4_m, _tmp1_m, _tmp5_m, _tmp2_m, \
              _tmp6_m, _tmp3_m, _tmp7_m, _out1, _out3, _out5, _out7); \
  }
1809
1810 /*
1811 * =============================================================================
1812 * Description : Butterfly of 4 input vectors
1813 * Arguments : Inputs - _in0, _in1, _in2, _in3
1814 * Outputs - _out0, _out1, _out2, _out3
1815 * Details : Butterfly operation
1816 * Example : LASX_BUTTERFLY_4
1817 * _out0 = _in0 + _in3;
1818 * _out1 = _in1 + _in2;
1819 * _out2 = _in1 - _in2;
1820 * _out3 = _in0 - _in3;
1821 * =============================================================================
1822 */
/* Byte-lane butterfly. All four results are computed from the inputs before
 * any output is stored, so an output may be the same variable as an input
 * (in-place use). */
#define LASX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  { \
    const __m256i _sum03_m = __lasx_xvadd_b(_in0, _in3); \
    const __m256i _sum12_m = __lasx_xvadd_b(_in1, _in2); \
    const __m256i _dif12_m = __lasx_xvsub_b(_in1, _in2); \
    const __m256i _dif03_m = __lasx_xvsub_b(_in0, _in3); \
    \
    _out0 = _sum03_m; \
    _out1 = _sum12_m; \
    _out2 = _dif12_m; \
    _out3 = _dif03_m; \
  }
/* Halfword-lane butterfly. All four results are computed from the inputs
 * before any output is stored, so an output may be the same variable as an
 * input (in-place use). */
#define LASX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  { \
    const __m256i _sum03_m = __lasx_xvadd_h(_in0, _in3); \
    const __m256i _sum12_m = __lasx_xvadd_h(_in1, _in2); \
    const __m256i _dif12_m = __lasx_xvsub_h(_in1, _in2); \
    const __m256i _dif03_m = __lasx_xvsub_h(_in0, _in3); \
    \
    _out0 = _sum03_m; \
    _out1 = _sum12_m; \
    _out2 = _dif12_m; \
    _out3 = _dif03_m; \
  }
/* Word-lane butterfly. All four results are computed from the inputs before
 * any output is stored, so an output may be the same variable as an input
 * (in-place use). */
#define LASX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  { \
    const __m256i _sum03_m = __lasx_xvadd_w(_in0, _in3); \
    const __m256i _sum12_m = __lasx_xvadd_w(_in1, _in2); \
    const __m256i _dif12_m = __lasx_xvsub_w(_in1, _in2); \
    const __m256i _dif03_m = __lasx_xvsub_w(_in0, _in3); \
    \
    _out0 = _sum03_m; \
    _out1 = _sum12_m; \
    _out2 = _dif12_m; \
    _out3 = _dif03_m; \
  }
/* Doubleword-lane butterfly. All four results are computed from the inputs
 * before any output is stored, so an output may be the same variable as an
 * input (in-place use). */
#define LASX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  { \
    const __m256i _sum03_m = __lasx_xvadd_d(_in0, _in3); \
    const __m256i _sum12_m = __lasx_xvadd_d(_in1, _in2); \
    const __m256i _dif12_m = __lasx_xvsub_d(_in1, _in2); \
    const __m256i _dif03_m = __lasx_xvsub_d(_in0, _in3); \
    \
    _out0 = _sum03_m; \
    _out1 = _sum12_m; \
    _out2 = _dif12_m; \
    _out3 = _dif03_m; \
  }
1851
1852 /*
1853 * =============================================================================
1854 * Description : Butterfly of 8 input vectors
1855 * Arguments : Inputs - _in0, _in1, _in2, _in3, ~
1856 * Outputs - _out0, _out1, _out2, _out3, ~
1857 * Details : Butterfly operation
1858 * Example : LASX_BUTTERFLY_8
1859 * _out0 = _in0 + _in7;
1860 * _out1 = _in1 + _in6;
1861 * _out2 = _in2 + _in5;
1862 * _out3 = _in3 + _in4;
1863 * _out4 = _in3 - _in4;
1864 * _out5 = _in2 - _in5;
1865 * _out6 = _in1 - _in6;
1866 * _out7 = _in0 - _in7;
1867 * =============================================================================
1868 */
/* Byte-lane butterfly of 8 vectors. All eight results are computed from the
 * inputs before any output is stored, so an output may be the same variable
 * as an input (in-place use). */
#define LASX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7) \
  { \
    const __m256i _sum07_m = __lasx_xvadd_b(_in0, _in7); \
    const __m256i _sum16_m = __lasx_xvadd_b(_in1, _in6); \
    const __m256i _sum25_m = __lasx_xvadd_b(_in2, _in5); \
    const __m256i _sum34_m = __lasx_xvadd_b(_in3, _in4); \
    const __m256i _dif34_m = __lasx_xvsub_b(_in3, _in4); \
    const __m256i _dif25_m = __lasx_xvsub_b(_in2, _in5); \
    const __m256i _dif16_m = __lasx_xvsub_b(_in1, _in6); \
    const __m256i _dif07_m = __lasx_xvsub_b(_in0, _in7); \
    \
    _out0 = _sum07_m; \
    _out1 = _sum16_m; \
    _out2 = _sum25_m; \
    _out3 = _sum34_m; \
    _out4 = _dif34_m; \
    _out5 = _dif25_m; \
    _out6 = _dif16_m; \
    _out7 = _dif07_m; \
  }
1882
/* Halfword-lane butterfly of 8 vectors. All eight results are computed from
 * the inputs before any output is stored, so an output may be the same
 * variable as an input (in-place use). */
#define LASX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7) \
  { \
    const __m256i _sum07_m = __lasx_xvadd_h(_in0, _in7); \
    const __m256i _sum16_m = __lasx_xvadd_h(_in1, _in6); \
    const __m256i _sum25_m = __lasx_xvadd_h(_in2, _in5); \
    const __m256i _sum34_m = __lasx_xvadd_h(_in3, _in4); \
    const __m256i _dif34_m = __lasx_xvsub_h(_in3, _in4); \
    const __m256i _dif25_m = __lasx_xvsub_h(_in2, _in5); \
    const __m256i _dif16_m = __lasx_xvsub_h(_in1, _in6); \
    const __m256i _dif07_m = __lasx_xvsub_h(_in0, _in7); \
    \
    _out0 = _sum07_m; \
    _out1 = _sum16_m; \
    _out2 = _sum25_m; \
    _out3 = _sum34_m; \
    _out4 = _dif34_m; \
    _out5 = _dif25_m; \
    _out6 = _dif16_m; \
    _out7 = _dif07_m; \
  }
1896
/* Word-lane butterfly of 8 vectors. All eight results are computed from the
 * inputs before any output is stored, so an output may be the same variable
 * as an input (in-place use). */
#define LASX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7) \
  { \
    const __m256i _sum07_m = __lasx_xvadd_w(_in0, _in7); \
    const __m256i _sum16_m = __lasx_xvadd_w(_in1, _in6); \
    const __m256i _sum25_m = __lasx_xvadd_w(_in2, _in5); \
    const __m256i _sum34_m = __lasx_xvadd_w(_in3, _in4); \
    const __m256i _dif34_m = __lasx_xvsub_w(_in3, _in4); \
    const __m256i _dif25_m = __lasx_xvsub_w(_in2, _in5); \
    const __m256i _dif16_m = __lasx_xvsub_w(_in1, _in6); \
    const __m256i _dif07_m = __lasx_xvsub_w(_in0, _in7); \
    \
    _out0 = _sum07_m; \
    _out1 = _sum16_m; \
    _out2 = _sum25_m; \
    _out3 = _sum34_m; \
    _out4 = _dif34_m; \
    _out5 = _dif25_m; \
    _out6 = _dif16_m; \
    _out7 = _dif07_m; \
  }
1910
/* Doubleword-lane butterfly of 8 vectors. All eight results are computed from
 * the inputs before any output is stored, so an output may be the same
 * variable as an input (in-place use). */
#define LASX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7) \
  { \
    const __m256i _sum07_m = __lasx_xvadd_d(_in0, _in7); \
    const __m256i _sum16_m = __lasx_xvadd_d(_in1, _in6); \
    const __m256i _sum25_m = __lasx_xvadd_d(_in2, _in5); \
    const __m256i _sum34_m = __lasx_xvadd_d(_in3, _in4); \
    const __m256i _dif34_m = __lasx_xvsub_d(_in3, _in4); \
    const __m256i _dif25_m = __lasx_xvsub_d(_in2, _in5); \
    const __m256i _dif16_m = __lasx_xvsub_d(_in1, _in6); \
    const __m256i _dif07_m = __lasx_xvsub_d(_in0, _in7); \
    \
    _out0 = _sum07_m; \
    _out1 = _sum16_m; \
    _out2 = _sum25_m; \
    _out3 = _sum34_m; \
    _out4 = _dif34_m; \
    _out5 = _dif25_m; \
    _out6 = _dif16_m; \
    _out7 = _dif07_m; \
  }
1924
1925 #endif // LASX
1926
/*
 * =============================================================================
 * Description : Print out elements in vector.
 * Arguments   : Inputs  - RTYPE, element_num, in0, enter
 *               Outputs -
 * Details     : Print out 'element_num' elements in 'RTYPE' vector 'in0', if
 *               'enter' is TRUE, prefix "\nVP:" will be added first.
 *               'in0' is cast to RTYPE and indexed element by element. Each
 *               element is converted to long long and printed with "%lld,",
 *               so the format always matches the argument, including for
 *               64-bit lanes. 'element_num' is evaluated once.
 *               printf must be declared where the macro is used.
 * Example     : VECT_PRINT(v4i32,4,in0,1); // in0: 1,2,3,4
 *               VP:1,2,3,4,
 * =============================================================================
 */
#define VECT_PRINT(RTYPE, element_num, in0, enter) \
  { \
    RTYPE _vp_vec_m = (RTYPE)(in0); \
    const int _vp_cnt_m = (int)(element_num); \
    int _vp_idx_m; \
    if (enter) { \
      printf("\nVP:"); \
    } \
    for (_vp_idx_m = 0; _vp_idx_m < _vp_cnt_m; _vp_idx_m++) { \
      printf("%lld,", (long long)_vp_vec_m[_vp_idx_m]); \
    } \
  }
1947
1948 #endif /* LOONGSON_INTRINSICS_H */
1949 #endif /* INCLUDE_LIBYUV_LOONGSON_INTRINSICS_H */
1950