/*
 * Copyright © 2020 Loongson Technology Co. Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Author:  Yin Shiyou (yinshiyou-hf@loongson.cn)
 *          Gu Xiwei   (guxiwei-hf@loongson.cn)
 */

/*
 * This header file is copied from loongson LSOM project.
 * The MSA macros are implemented with the MSA intrinsics from msa.h
 * and are used to simplify MSA optimization.
31 */ 32 33 #ifndef _MSA_MACROS_H 34 #define _MSA_MACROS_H 1 35 #define MSA_MACROS_VERSION 18 36 #include <msa.h> 37 38 #if (__mips_isa_rev >= 6) 39 #define LH(psrc) \ 40 ( { \ 41 uint16_t val_lh_m = *(uint16_t *)(psrc); \ 42 val_lh_m; \ 43 } ) 44 45 #define LW(psrc) \ 46 ( { \ 47 uint32_t val_lw_m = *(uint32_t *)(psrc); \ 48 val_lw_m; \ 49 } ) 50 51 #if (__mips == 64) 52 #define LD(psrc) \ 53 ( { \ 54 uint64_t val_ld_m = *(uint64_t *)(psrc); \ 55 val_ld_m; \ 56 } ) 57 #else // !(__mips == 64) 58 #define LD(psrc) \ 59 ( { \ 60 uint8_t *psrc_ld_m = (uint8_t *) (psrc); \ 61 uint32_t val0_ld_m, val1_ld_m; \ 62 uint64_t val_ld_m = 0; \ 63 \ 64 val0_ld_m = LW(psrc_ld_m); \ 65 val1_ld_m = LW(psrc_ld_m + 4); \ 66 \ 67 val_ld_m = (uint64_t) (val1_ld_m); \ 68 val_ld_m = (uint64_t) ((val_ld_m << 32) & 0xFFFFFFFF00000000); \ 69 val_ld_m = (uint64_t) (val_ld_m | (uint64_t) val0_ld_m); \ 70 \ 71 val_ld_m; \ 72 } ) 73 #endif // (__mips == 64) 74 75 #define SH(val, pdst) *(uint16_t *)(pdst) = (val); 76 #define SW(val, pdst) *(uint32_t *)(pdst) = (val); 77 #define SD(val, pdst) *(uint64_t *)(pdst) = (val); 78 79 #else // !(__mips_isa_rev >= 6) 80 #define LH(psrc) \ 81 ( { \ 82 uint8_t *psrc_lh_m = (uint8_t *) (psrc); \ 83 uint16_t val_lh_m; \ 84 \ 85 __asm__ volatile ( \ 86 "ulh %[val_lh_m], %[psrc_lh_m] \n\t" \ 87 \ 88 : [val_lh_m] "=r" (val_lh_m) \ 89 : [psrc_lh_m] "m" (*psrc_lh_m) \ 90 ); \ 91 \ 92 val_lh_m; \ 93 } ) 94 95 #define LW(psrc) \ 96 ( { \ 97 uint8_t *psrc_lw_m = (uint8_t *) (psrc); \ 98 uint32_t val_lw_m; \ 99 \ 100 __asm__ volatile ( \ 101 "ulw %[val_lw_m], %[psrc_lw_m] \n\t" \ 102 \ 103 : [val_lw_m] "=r" (val_lw_m) \ 104 : [psrc_lw_m] "m" (*psrc_lw_m) \ 105 ); \ 106 \ 107 val_lw_m; \ 108 } ) 109 110 #if (__mips == 64) 111 #define LD(psrc) \ 112 ( { \ 113 uint8_t *psrc_ld_m = (uint8_t *) (psrc); \ 114 uint64_t val_ld_m = 0; \ 115 \ 116 __asm__ volatile ( \ 117 "uld %[val_ld_m], %[psrc_ld_m] \n\t" \ 118 \ 119 : [val_ld_m] "=r" (val_ld_m) \ 120 : [psrc_ld_m] "m" 
(*psrc_ld_m) \ 121 ); \ 122 \ 123 val_ld_m; \ 124 } ) 125 #else // !(__mips == 64) 126 #define LD(psrc) \ 127 ( { \ 128 uint8_t *psrc_ld_m = (uint8_t *) (psrc); \ 129 uint32_t val0_ld_m, val1_ld_m; \ 130 uint64_t val_ld_m = 0; \ 131 \ 132 val0_ld_m = LW(psrc_ld_m); \ 133 val1_ld_m = LW(psrc_ld_m + 4); \ 134 \ 135 val_ld_m = (uint64_t) (val1_ld_m); \ 136 val_ld_m = (uint64_t) ((val_ld_m << 32) & 0xFFFFFFFF00000000); \ 137 val_ld_m = (uint64_t) (val_ld_m | (uint64_t) val0_ld_m); \ 138 \ 139 val_ld_m; \ 140 } ) 141 #endif // (__mips == 64) 142 143 #define SH(val, pdst) \ 144 { \ 145 uint8_t *pdst_sh_m = (uint8_t *) (pdst); \ 146 uint16_t val_sh_m = (val); \ 147 \ 148 __asm__ volatile ( \ 149 "ush %[val_sh_m], %[pdst_sh_m] \n\t" \ 150 \ 151 : [pdst_sh_m] "=m" (*pdst_sh_m) \ 152 : [val_sh_m] "r" (val_sh_m) \ 153 ); \ 154 } 155 156 #define SW(val, pdst) \ 157 { \ 158 uint8_t *pdst_sw_m = (uint8_t *) (pdst); \ 159 uint32_t val_sw_m = (val); \ 160 \ 161 __asm__ volatile ( \ 162 "usw %[val_sw_m], %[pdst_sw_m] \n\t" \ 163 \ 164 : [pdst_sw_m] "=m" (*pdst_sw_m) \ 165 : [val_sw_m] "r" (val_sw_m) \ 166 ); \ 167 } 168 169 #define SD(val, pdst) \ 170 { \ 171 uint8_t *pdst_sd_m = (uint8_t *) (pdst); \ 172 uint32_t val0_sd_m, val1_sd_m; \ 173 \ 174 val0_sd_m = (uint32_t) ((val) & 0x00000000FFFFFFFF); \ 175 val1_sd_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF); \ 176 \ 177 SW(val0_sd_m, pdst_sd_m); \ 178 SW(val1_sd_m, pdst_sd_m + 4); \ 179 } 180 #endif // (__mips_isa_rev >= 6) 181 182 183 184 185 186 187 /* Description : Load vector elements with stride. 188 * Arguments : Inputs - psrc (source pointer to load from) 189 * - stride 190 * Outputs - out0, out1... 191 * Return Type - as per RTYPE 192 * Details : Loads elements in 'out0' from (psrc). 193 * Loads elements in 'out1' from (psrc + stride). 
194 */ 195 #define MSA_LD_V(RTYPE, psrc, out) (out) = *((RTYPE *)(psrc)); 196 197 #define MSA_LD_V2(RTYPE, psrc, stride, out0, out1) \ 198 { \ 199 MSA_LD_V(RTYPE, (psrc), out0); \ 200 MSA_LD_V(RTYPE, (psrc) + (stride), out1); \ 201 } 202 203 #define MSA_LD_V4(RTYPE, psrc, stride, out0, out1, out2, out3) \ 204 { \ 205 MSA_LD_V2(RTYPE, (psrc), stride, out0, out1); \ 206 MSA_LD_V2(RTYPE, (psrc) + 2 * (stride) , stride, out2, out3); \ 207 } 208 209 #define MSA_LD_V8(RTYPE, psrc, stride, out0, out1, out2, out3, \ 210 out4, out5, out6, out7) \ 211 { \ 212 MSA_LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ 213 MSA_LD_V4(RTYPE, (psrc) + 4 * (stride), stride, out4, out5, out6, out7); \ 214 } 215 216 /* Description : Store vectors with stride. 217 * Arguments : Inputs - in0, in1... (source vector to be stored) 218 * - stride 219 * Outputs - pdst (destination pointer to store to) 220 * Details : Stores elements from 'in0' to (pdst). 221 * Stores elements from 'in1' to (pdst + stride). 222 */ 223 #define MSA_ST_V(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in); 224 225 #define MSA_ST_V2(RTYPE, in0, in1, pdst, stride) \ 226 { \ 227 MSA_ST_V(RTYPE, in0, (pdst)); \ 228 MSA_ST_V(RTYPE, in1, (pdst) + (stride)); \ 229 } 230 231 #define MSA_ST_V4(RTYPE, in0, in1, in2, in3, pdst, stride) \ 232 { \ 233 MSA_ST_V2(RTYPE, in0, in1, (pdst), stride); \ 234 MSA_ST_V2(RTYPE, in2, in3, (pdst) + 2 * (stride), stride); \ 235 } 236 237 #define MSA_ST_V8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ 238 { \ 239 MSA_ST_V4(RTYPE, in0, in1, in2, in3, (pdst), stride); \ 240 MSA_ST_V4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * (stride), stride); \ 241 } 242 243 /* Description : Store half word elements of vector with stride. 244 * Arguments : Inputs - in (source vector) 245 * - pdst (destination pointer to store to) 246 * - stride 247 * Details : Stores half word 'idx0' from 'in' to (pdst). 248 * Stores half word 'idx1' from 'in' to (pdst + stride). 249 * Similar for other elements. 
250 */ 251 #define MSA_ST_H(in, idx, pdst) \ 252 { \ 253 uint16_t out0_m; \ 254 out0_m = __msa_copy_u_h((v8i16) in, idx); \ 255 SH(out0_m, (pdst)); \ 256 } 257 #define MSA_ST_H2(in, idx0, idx1, pdst, stride) \ 258 { \ 259 uint16_t out0_m, out1_m; \ 260 out0_m = __msa_copy_u_h((v8i16) in, idx0); \ 261 out1_m = __msa_copy_u_h((v8i16) in, idx1); \ 262 SH(out0_m, (pdst)); \ 263 SH(out1_m, (pdst) + stride); \ 264 } 265 #define MSA_ST_H4(in, idx0, idx1, idx2, idx3, pdst, stride) \ 266 { \ 267 uint16_t out0_m, out1_m, out2_m, out3_m; \ 268 out0_m = __msa_copy_u_h((v8i16) in, idx0); \ 269 out1_m = __msa_copy_u_h((v8i16) in, idx1); \ 270 out2_m = __msa_copy_u_h((v8i16) in, idx2); \ 271 out3_m = __msa_copy_u_h((v8i16) in, idx3); \ 272 SH(out0_m, (pdst)); \ 273 SH(out1_m, (pdst) + stride); \ 274 SH(out2_m, (pdst) + 2 * stride); \ 275 SH(out3_m, (pdst) + 3 * stride); \ 276 } 277 #define MSA_ST_H8(in, idx0, idx1, idx2, idx3, idx4, idx5, \ 278 idx6, idx7, pdst, stride) \ 279 { \ 280 MSA_ST_H4(in, idx0, idx1, idx2, idx3, pdst, stride) \ 281 MSA_ST_H4(in, idx4, idx5, idx6, idx7, (pdst) + 4*stride, stride) \ 282 } 283 284 /* Description : Store word elements of vector with stride. 285 * Arguments : Inputs - in (source vector) 286 * - pdst (destination pointer to store to) 287 * - stride 288 * Details : Stores word 'idx0' from 'in' to (pdst). 289 * Stores word 'idx1' from 'in' to (pdst + stride). 290 * Similar for other elements. 
291 */ 292 #define MSA_ST_W(in, idx, pdst) \ 293 { \ 294 uint32_t out0_m; \ 295 out0_m = __msa_copy_u_w((v4i32) in, idx); \ 296 SW(out0_m, (pdst)); \ 297 } 298 #define MSA_ST_W2(in, idx0, idx1, pdst, stride) \ 299 { \ 300 uint32_t out0_m, out1_m; \ 301 out0_m = __msa_copy_u_w((v4i32) in, idx0); \ 302 out1_m = __msa_copy_u_w((v4i32) in, idx1); \ 303 SW(out0_m, (pdst)); \ 304 SW(out1_m, (pdst) + stride); \ 305 } 306 #define MSA_ST_W4(in, idx0, idx1, idx2, idx3, pdst, stride) \ 307 { \ 308 uint32_t out0_m, out1_m, out2_m, out3_m; \ 309 out0_m = __msa_copy_u_w((v4i32) in, idx0); \ 310 out1_m = __msa_copy_u_w((v4i32) in, idx1); \ 311 out2_m = __msa_copy_u_w((v4i32) in, idx2); \ 312 out3_m = __msa_copy_u_w((v4i32) in, idx3); \ 313 SW(out0_m, (pdst)); \ 314 SW(out1_m, (pdst) + stride); \ 315 SW(out2_m, (pdst) + 2*stride); \ 316 SW(out3_m, (pdst) + 3*stride); \ 317 } 318 #define MSA_ST_W8(in0, in1, idx0, idx1, idx2, idx3, \ 319 idx4, idx5, idx6, idx7, pdst, stride) \ 320 { \ 321 MSA_ST_W4(in0, idx0, idx1, idx2, idx3, pdst, stride) \ 322 MSA_ST_W4(in1, idx4, idx5, idx6, idx7, pdst + 4*stride, stride) \ 323 } 324 325 /* Description : Store double word elements of vector with stride. 326 * Arguments : Inputs - in (source vector) 327 * - pdst (destination pointer to store to) 328 * - stride 329 * Details : Stores double word 'idx0' from 'in' to (pdst). 330 * Stores double word 'idx1' from 'in' to (pdst + stride). 331 * Similar for other elements. 
332 */ 333 #define MSA_ST_D(in, idx, pdst) \ 334 { \ 335 uint64_t out0_m; \ 336 out0_m = __msa_copy_u_d((v2i64) in, idx); \ 337 SD(out0_m, (pdst)); \ 338 } 339 #define MSA_ST_D2(in, idx0, idx1, pdst, stride) \ 340 { \ 341 uint64_t out0_m, out1_m; \ 342 out0_m = __msa_copy_u_d((v2i64) in, idx0); \ 343 out1_m = __msa_copy_u_d((v2i64) in, idx1); \ 344 SD(out0_m, (pdst)); \ 345 SD(out1_m, (pdst) + stride); \ 346 } 347 #define MSA_ST_D4(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \ 348 { \ 349 uint64_t out0_m, out1_m, out2_m, out3_m; \ 350 out0_m = __msa_copy_u_d((v2i64) in0, idx0); \ 351 out1_m = __msa_copy_u_d((v2i64) in0, idx1); \ 352 out2_m = __msa_copy_u_d((v2i64) in1, idx2); \ 353 out3_m = __msa_copy_u_d((v2i64) in1, idx3); \ 354 SD(out0_m, (pdst)); \ 355 SD(out1_m, (pdst) + stride); \ 356 SD(out2_m, (pdst) + 2 * stride); \ 357 SD(out3_m, (pdst) + 3 * stride); \ 358 } 359 #define MSA_ST_D8(in0, in1, in2, in3, idx0, idx1, idx2, idx3, \ 360 idx4, idx5, idx6, idx7, pdst, stride) \ 361 { \ 362 MSA_ST_D4(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \ 363 MSA_ST_D4(in2, in3, idx4, idx5, idx6, idx7, pdst + 4 * stride, stride) \ 364 } 365 366 /* Description : Shuffle byte vector elements as per mask vector. 367 * Arguments : Inputs - in0, in1 (source vectors) 368 * - mask (mask vectors) 369 * Outputs - out (dstination vectors) 370 * Return Type - as per RTYPE 371 * Details : Selective byte elements from 'in0' & 'in1' are copied to 'out' as 372 * per control vector 'mask'. 
373 */ 374 #define MSA_VSHF_B(RTYPE, in0, in1, mask, out) \ 375 { \ 376 out = (RTYPE) __msa_vshf_b((v16i8) mask, (v16i8) in0, (v16i8) in1); \ 377 } 378 379 #define MSA_VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \ 380 { \ 381 MSA_VSHF_B(RTYPE, in0, in1, mask0, out0) \ 382 MSA_VSHF_B(RTYPE, in2, in3, mask1, out1) \ 383 } 384 385 #define MSA_VSHF_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 386 mask0, mask1, mask2, mask3, out0, out1, out2, out3) \ 387 { \ 388 MSA_VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1); \ 389 MSA_VSHF_B2(RTYPE, in4, in5, in6, in7, mask2, mask3, out2, out3); \ 390 } 391 392 /* Description : Shuffle halfword vector elements as per mask vector. 393 * Arguments : Inputs - in0, in1 (source vectors) 394 * - mask (mask vectors) 395 * Outputs - out (dstination vectors) 396 * Return Type - as per RTYPE 397 * Details : Selective halfword elements from 'in0' & 'in1' are copied to 'out' as 398 * per control vector 'mask'. 399 */ 400 #define MSA_VSHF_H(RTYPE, in0, in1, mask, out) \ 401 { \ 402 out = (RTYPE) __msa_vshf_h((v8i16) mask, (v8i16) in0, (v8i16) in1); \ 403 } 404 405 #define MSA_VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \ 406 { \ 407 MSA_VSHF_H(RTYPE, in0, in1, mask0, out0) \ 408 MSA_VSHF_H(RTYPE, in2, in3, mask1, out1) \ 409 } 410 411 #define MSA_VSHF_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 412 mask0, mask1, mask2, mask3, out0, out1, out2, out3) \ 413 { \ 414 MSA_VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1); \ 415 MSA_VSHF_H2(RTYPE, in4, in5, in6, in7, mask2, mask3, out2, out3); \ 416 } 417 418 /* Description : Shuffle word vector elements as per mask vector. 419 * Arguments : Inputs - in0, in1 (source vectors) 420 * - mask (mask vectors) 421 * Outputs - out (dstination vectors) 422 * Return Type - as per RTYPE 423 * Details : Selective word elements from 'in0' & 'in1' are copied to 'out' as 424 * per control vector 'mask'. 
425 */ 426 #define MSA_VSHF_W(RTYPE, in0, in1, mask, out) \ 427 { \ 428 out = (RTYPE) __msa_vshf_w((v4i32) mask, (v4i32) in0, (v4i32) in1); \ 429 } 430 431 #define MSA_VSHF_W2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \ 432 { \ 433 MSA_VSHF_W(RTYPE, in0, in1, mask0, out0) \ 434 MSA_VSHF_W(RTYPE, in2, in3, mask1, out1) \ 435 } 436 437 #define MSA_VSHF_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 438 mask0, mask1, mask2, mask3, out0, out1, out2, out3) \ 439 { \ 440 MSA_VSHF_W2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1); \ 441 MSA_VSHF_W2(RTYPE, in4, in5, in6, in7, mask2, mask3, out2, out3); \ 442 } 443 444 /* Description : Interleave even byte elements from vectors. 445 * Arguments : Inputs - in0, in1 446 * Outputs - out 447 * Return Type - as per RTYPE 448 * Details : Even byte elements of 'in0' and even byte 449 * elements of 'in1' are interleaved and copied to 'out'. 450 */ 451 #define MSA_ILVEV_B(RTYPE, in0, in1, out) \ 452 { \ 453 out = (RTYPE) __msa_ilvev_b((v16i8) in0, (v16i8) in1); \ 454 } 455 456 #define MSA_ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ 457 { \ 458 MSA_ILVEV_B(RTYPE, in0, in1, out0); \ 459 MSA_ILVEV_B(RTYPE, in2, in3, out1); \ 460 } 461 462 #define MSA_ILVEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 463 out0, out1, out2, out3) \ 464 { \ 465 MSA_ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ 466 MSA_ILVEV_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ 467 } 468 469 /* Description : Interleave even half word elements from vectors. 470 * Arguments : Inputs - in0, in1 471 * Outputs - out 472 * Return Type - as per RTYPE 473 * Details : Even half word elements of 'in0' and even half word 474 * elements of 'in1' are interleaved and copied to 'out'. 
475 */ 476 #define MSA_ILVEV_H(RTYPE, in0, in1, out) \ 477 { \ 478 out = (RTYPE) __msa_ilvev_h((v8i16) in0, (v8i16) in1); \ 479 } 480 481 #define MSA_ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ 482 { \ 483 MSA_ILVEV_H(RTYPE, in0, in1, out0); \ 484 MSA_ILVEV_H(RTYPE, in2, in3, out1); \ 485 } 486 487 #define MSA_ILVEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 488 out0, out1, out2, out3) \ 489 { \ 490 MSA_ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ 491 MSA_ILVEV_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ 492 } 493 494 /* Description : Interleave even word elements from vectors. 495 * Arguments : Inputs - in0, in1 496 * Outputs - out 497 * Return Type - as per RTYPE 498 * Details : Even word elements of 'in0' and even word 499 * elements of 'in1' are interleaved and copied to 'out'. 500 */ 501 #define MSA_ILVEV_W(RTYPE, in0, in1, out) \ 502 { \ 503 out = (RTYPE) __msa_ilvev_w((v2i64) in0, (v2i64) in1); \ 504 } 505 506 #define MSA_ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ 507 { \ 508 MSA_ILVEV_W(RTYPE, in0, in1, out0); \ 509 MSA_ILVEV_W(RTYPE, in2, in3, out1); \ 510 } 511 512 #define MSA_ILVEV_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 513 out0, out1, out2, out3) \ 514 { \ 515 MSA_ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1); \ 516 MSA_ILVEV_W2(RTYPE, in4, in5, in6, in7, out2, out3); \ 517 } 518 519 /* Description : Interleave even double word elements from vectors. 520 * Arguments : Inputs - in0, in1 521 * Outputs - out 522 * Return Type - as per RTYPE 523 * Details : Even double word elements of 'in0' and even double word 524 * elements of 'in1' are interleaved and copied to 'out'. 
525 */ 526 #define MSA_ILVEV_D(RTYPE, in0, in1, out) \ 527 { \ 528 out = (RTYPE) __msa_ilvev_d((v2i64) in0, (v2i64) in1); \ 529 } 530 531 #define MSA_ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ 532 { \ 533 MSA_ILVEV_D(RTYPE, in0, in1, out0); \ 534 MSA_ILVEV_D(RTYPE, in2, in3, out1); \ 535 } 536 537 #define MSA_ILVEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 538 out0, out1, out2, out3) \ 539 { \ 540 MSA_ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ 541 MSA_ILVEV_D2(RTYPE, in4, in5, in6, in7, out2, out3); \ 542 } 543 544 /* Description : Interleave odd byte elements from vectors. 545 * Arguments : Inputs - in0, in1 546 * Outputs - out 547 * Return Type - as per RTYPE 548 * Details : Odd byte elements of 'in0' and odd byte 549 * elements of 'in1' are interleaved and copied to 'out'. 550 */ 551 #define MSA_ILVOD_B(RTYPE, in0, in1, out) \ 552 { \ 553 out = (RTYPE) __msa_ilvod_b((v16i8) in0, (v16i8) in1); \ 554 } 555 556 #define MSA_ILVOD_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ 557 { \ 558 MSA_ILVOD_B(RTYPE, in0, in1, out0); \ 559 MSA_ILVOD_B(RTYPE, in2, in3, out1); \ 560 } 561 562 #define MSA_ILVOD_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 563 out0, out1, out2, out3) \ 564 { \ 565 MSA_ILVOD_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ 566 MSA_ILVOD_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ 567 } 568 569 /* Description : Interleave odd half word elements from vectors. 570 * Arguments : Inputs - in0, in1 571 * Outputs - out 572 * Return Type - as per RTYPE 573 * Details : Odd half word elements of 'in0' and odd half word 574 * elements of 'in1' are interleaved and copied to 'out'. 
575 */ 576 #define MSA_ILVOD_H(RTYPE, in0, in1, out) \ 577 { \ 578 out = (RTYPE) __msa_ilvod_h((v8i16) in0, (v8i16) in1); \ 579 } 580 581 #define MSA_ILVOD_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ 582 { \ 583 MSA_ILVOD_H(RTYPE, in0, in1, out0); \ 584 MSA_ILVOD_H(RTYPE, in2, in3, out1); \ 585 } 586 587 #define MSA_ILVOD_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 588 out0, out1, out2, out3) \ 589 { \ 590 MSA_ILVOD_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ 591 MSA_ILVOD_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ 592 } 593 594 /* Description : Interleave odd word elements from vectors. 595 * Arguments : Inputs - in0, in1 596 * Outputs - out 597 * Return Type - as per RTYPE 598 * Details : Odd word elements of 'in0' and odd word 599 * elements of 'in1' are interleaved and copied to 'out'. 600 */ 601 #define MSA_ILVOD_W(RTYPE, in0, in1, out) \ 602 { \ 603 out = (RTYPE) __msa_ilvod_w((v4i32) in0, (v4i32) in1); \ 604 } 605 606 #define MSA_ILVOD_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ 607 { \ 608 MSA_ILVOD_W(RTYPE, in0, in1, out0); \ 609 MSA_ILVOD_W(RTYPE, in2, in3, out1); \ 610 } 611 612 #define MSA_ILVOD_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 613 out0, out1, out2, out3) \ 614 { \ 615 MSA_ILVOD_W2(RTYPE, in0, in1, in2, in3, out0, out1); \ 616 MSA_ILVOD_W2(RTYPE, in4, in5, in6, in7, out2, out3); \ 617 } 618 619 /* Description : Interleave odd double word elements from vectors. 620 * Arguments : Inputs - in0, in1 621 * Outputs - out 622 * Return Type - as per RTYPE 623 * Details : Odd double word elements of 'in0' and odd double word 624 * elements of 'in1' are interleaved and copied to 'out'. 
625 */ 626 #define MSA_ILVOD_D(RTYPE, in0, in1, out) \ 627 { \ 628 out = (RTYPE) __msa_ilvod_d((v2i64) in0, (v2i64) in1); \ 629 } 630 631 #define MSA_ILVOD_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ 632 { \ 633 MSA_ILVOD_D(RTYPE, in0, in1, out0); \ 634 MSA_ILVOD_D(RTYPE, in2, in3, out1); \ 635 } 636 637 #define MSA_ILVOD_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 638 out0, out1, out2, out3) \ 639 { \ 640 MSA_ILVOD_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ 641 MSA_ILVOD_D2(RTYPE, in4, in5, in6, in7, out2, out3); \ 642 } 643 644 /* Description : Interleave left half of byte elements from vectors. 645 * Arguments : Inputs - in0, in1 646 * Outputs - out 647 * Return Type - as per RTYPE 648 * Details : Left half of byte elements of 'in0' and left half of byte 649 * elements of 'in1' are interleaved and copied to 'out'. 650 */ 651 #define MSA_ILVL_B(RTYPE, in0, in1, out) \ 652 { \ 653 out = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1); \ 654 } 655 656 #define MSA_ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ 657 { \ 658 MSA_ILVL_B(RTYPE, in0, in1, out0); \ 659 MSA_ILVL_B(RTYPE, in2, in3, out1); \ 660 } 661 662 #define MSA_ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 663 out0, out1, out2, out3) \ 664 { \ 665 MSA_ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ 666 MSA_ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ 667 } 668 669 /* Description : Interleave left half of halfword elements from vectors. 670 * Arguments : Inputs - in0, in1 671 * Outputs - out 672 * Return Type - as per RTYPE 673 * Details : Left half of halfword elements of 'in0' and left half of halfword 674 * elements of 'in1' are interleaved and copied to 'out'. 
675 */ 676 #define MSA_ILVL_H(RTYPE, in0, in1, out) \ 677 { \ 678 out = (RTYPE) __msa_ilvl_h((v8i16) in0, (v8i16) in1); \ 679 } 680 681 #define MSA_ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ 682 { \ 683 MSA_ILVL_H(RTYPE, in0, in1, out0); \ 684 MSA_ILVL_H(RTYPE, in2, in3, out1); \ 685 } 686 687 #define MSA_ILVL_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 688 out0, out1, out2, out3) \ 689 { \ 690 MSA_ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ 691 MSA_ILVL_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ 692 } 693 694 /* Description : Interleave left half of word elements from vectors. 695 * Arguments : Inputs - in0, in1 696 * Outputs - out 697 * Return Type - as per RTYPE 698 * Details : Left half of word elements of 'in0' and left half of word 699 * elements of 'in1' are interleaved and copied to 'out'. 700 */ 701 #define MSA_ILVL_W(RTYPE, in0, in1, out) \ 702 { \ 703 out = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1); \ 704 } 705 706 #define MSA_ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ 707 { \ 708 MSA_ILVL_W(RTYPE, in0, in1, out0); \ 709 MSA_ILVL_W(RTYPE, in2, in3, out1); \ 710 } 711 712 #define MSA_ILVL_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 713 out0, out1, out2, out3) \ 714 { \ 715 MSA_ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1); \ 716 MSA_ILVL_W2(RTYPE, in4, in5, in6, in7, out2, out3); \ 717 } 718 719 /* Description : Interleave left half of double word elements from vectors. 720 * Arguments : Inputs - in0, in1 721 * Outputs - out 722 * Return Type - as per RTYPE 723 * Details : Left half of double word elements of 'in0' and left half of 724 * double word elements of 'in1' are interleaved and copied to 'out'. 
725 */ 726 #define MSA_ILVL_D(RTYPE, in0, in1, out) \ 727 { \ 728 out = (RTYPE) __msa_ilvl_d((v2i64) in0, (v2i64) in1); \ 729 } 730 731 #define MSA_ILVL_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ 732 { \ 733 MSA_ILVL_D(RTYPE, in0, in1, out0); \ 734 MSA_ILVL_D(RTYPE, in2, in3, out1); \ 735 } 736 737 #define MSA_ILVL_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 738 out0, out1, out2, out3) \ 739 { \ 740 MSA_ILVL_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ 741 MSA_ILVL_D2(RTYPE, in4, in5, in6, in7, out2, out3); \ 742 } 743 744 /* Description : Interleave right half of byte elements from vectors. 745 * Arguments : Inputs - in0, in1 746 * Outputs - out 747 * Return Type - as per RTYPE 748 * Details : Right half of byte elements of 'in0' and right half of byte 749 * elements of 'in1' are interleaved and copied to 'out'. 750 */ 751 #define MSA_ILVR_B(RTYPE, in0, in1, out) \ 752 { \ 753 out = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1); \ 754 } 755 756 #define MSA_ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ 757 { \ 758 MSA_ILVR_B(RTYPE, in0, in1, out0); \ 759 MSA_ILVR_B(RTYPE, in2, in3, out1); \ 760 } 761 762 #define MSA_ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 763 out0, out1, out2, out3) \ 764 { \ 765 MSA_ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ 766 MSA_ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ 767 } 768 769 /* Description : Interleave right half of halfword elements from vectors. 770 * Arguments : Inputs - in0, in1 771 * Outputs - out 772 * Return Type - as per RTYPE 773 * Details : Right half of halfword elements of 'in0' and right half of halfword 774 * elements of 'in1' are interleaved and copied to 'out'. 
775 */ 776 #define MSA_ILVR_H(RTYPE, in0, in1, out) \ 777 { \ 778 out = (RTYPE) __msa_ilvr_h((v8i16) in0, (v8i16) in1); \ 779 } 780 781 #define MSA_ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ 782 { \ 783 MSA_ILVR_H(RTYPE, in0, in1, out0); \ 784 MSA_ILVR_H(RTYPE, in2, in3, out1); \ 785 } 786 787 #define MSA_ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 788 out0, out1, out2, out3) \ 789 { \ 790 MSA_ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ 791 MSA_ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ 792 } 793 794 /* Description : Interleave right half of word elements from vectors. 795 * Arguments : Inputs - in0, in1 796 * Outputs - out 797 * Return Type - as per RTYPE 798 * Details : Right half of word elements of 'in0' and right half of word 799 * elements of 'in1' are interleaved and copied to 'out'. 800 */ 801 #define MSA_ILVR_W(RTYPE, in0, in1, out) \ 802 { \ 803 out = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1); \ 804 } 805 806 #define MSA_ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ 807 { \ 808 MSA_ILVR_W(RTYPE, in0, in1, out0); \ 809 MSA_ILVR_W(RTYPE, in2, in3, out1); \ 810 } 811 812 #define MSA_ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 813 out0, out1, out2, out3) \ 814 { \ 815 MSA_ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1); \ 816 MSA_ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3); \ 817 } 818 819 /* Description : Interleave right half of double word elements from vectors. 820 * Arguments : Inputs - in0, in1 821 * Outputs - out 822 * Return Type - as per RTYPE 823 * Details : Right half of double word elements of 'in0' and right half of 824 * double word elements of 'in1' are interleaved and copied to 'out'. 
825 */ 826 #define MSA_ILVR_D(RTYPE, in0, in1, out) \ 827 { \ 828 out = (RTYPE) __msa_ilvr_d((v2i64) in0, (v2i64) in1); \ 829 } 830 831 #define MSA_ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ 832 { \ 833 MSA_ILVR_D(RTYPE, in0, in1, out0); \ 834 MSA_ILVR_D(RTYPE, in2, in3, out1); \ 835 } 836 837 #define MSA_ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 838 out0, out1, out2, out3) \ 839 { \ 840 MSA_ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ 841 MSA_ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3); \ 842 } 843 844 /* Description : Interleave both left and right half of input vectors. 845 * Arguments : Inputs - in0, in1 846 * Outputs - out0, out1 847 * Return Type - as per RTYPE 848 * Details : Right half of byte elements from 'in0' and 'in1' are 849 * interleaved and stored to 'out0'. 850 * Left half of byte elements from 'in0' and 'in1' are 851 * interleaved and stored to 'out1'. 852 */ 853 #define MSA_ILVRL_B2(RTYPE, in0, in1, out0, out1) \ 854 { \ 855 MSA_ILVR_B(RTYPE, in0, in1, out0); \ 856 MSA_ILVL_B(RTYPE, in0, in1, out1); \ 857 } 858 859 #define MSA_ILVRL_B4(RTYPE, in0, in1, in2, in3, \ 860 out0, out1, out2, out3) \ 861 { \ 862 MSA_ILVRL_B2(RTYPE, in0, in1, out0, out1); \ 863 MSA_ILVRL_B2(RTYPE, in2, in3, out2, out3); \ 864 } 865 866 /* Description : Interleave both left and right half of input vectors. 867 * Arguments : Inputs - in0, in1 868 * Outputs - out0, out1 869 * Return Type - as per RTYPE 870 * Details : Right half of halfword elements from 'in0' and 'in1' are 871 * interleaved and stored to 'out0'. 872 * Left half of halfword elements from 'in0' and 'in1' are 873 * interleaved and stored to 'out1'. 
874 */ 875 #define MSA_ILVRL_H2(RTYPE, in0, in1, out0, out1) \ 876 { \ 877 MSA_ILVR_H(RTYPE, in0, in1, out0); \ 878 MSA_ILVL_H(RTYPE, in0, in1, out1); \ 879 } 880 881 #define MSA_ILVRL_H4(RTYPE, in0, in1, in2, in3, \ 882 out0, out1, out2, out3) \ 883 { \ 884 MSA_ILVRL_H2(RTYPE, in0, in1, out0, out1); \ 885 MSA_ILVRL_H2(RTYPE, in2, in3, out2, out3); \ 886 } 887 888 /* Description : Interleave both left and right half of input vectors. 889 * Arguments : Inputs - in0, in1 890 * Outputs - out0, out1 891 * Return Type - as per RTYPE 892 * Details : Right half of word elements from 'in0' and 'in1' are 893 * interleaved and stored to 'out0'. 894 * Left half of word elements from 'in0' and 'in1' are 895 * interleaved and stored to 'out1'. 896 */ 897 #define MSA_ILVRL_W2(RTYPE, in0, in1, out0, out1) \ 898 { \ 899 MSA_ILVR_W(RTYPE, in0, in1, out0); \ 900 MSA_ILVL_W(RTYPE, in0, in1, out1); \ 901 } 902 903 #define MSA_ILVRL_W4(RTYPE, in0, in1, in2, in3, \ 904 out0, out1, out2, out3) \ 905 { \ 906 MSA_ILVRL_W2(RTYPE, in0, in1, out0, out1); \ 907 MSA_ILVRL_W2(RTYPE, in2, in3, out2, out3); \ 908 } 909 910 /* Description : Interleave both left and right half of input vectors. 911 * Arguments : Inputs - in0, in1 912 * Outputs - out0, out1 913 * Return Type - as per RTYPE 914 * Details : Right half of double word elements from 'in0' and 'in1' are 915 * interleaved and stored to 'out0'. 916 * Left half of double word elements from 'in0' and 'in1' are 917 * interleaved and stored to 'out1'. 918 */ 919 #define MSA_ILVRL_D2(RTYPE, in0, in1, out0, out1) \ 920 { \ 921 MSA_ILVR_D(RTYPE, in0, in1, out0); \ 922 MSA_ILVL_D(RTYPE, in0, in1, out1); \ 923 } 924 925 #define MSA_ILVRL_D4(RTYPE, in0, in1, in2, in3, \ 926 out0, out1, out2, out3) \ 927 { \ 928 MSA_ILVRL_D2(RTYPE, in0, in1, out0, out1); \ 929 MSA_ILVRL_D2(RTYPE, in2, in3, out2, out3); \ 930 } 931 932 /* Description : Indexed byte elements are replicated to all elements in 933 * output vector. 
934 * Arguments : Inputs - in, idx 935 * Outputs - out 936 * Return Type - as per RTYPE 937 * Details : 'idx' element value from 'in' vector is replicated to all 938 * elements in 'out' vector. 939 * Valid index range for halfword operation is 0-7. 940 */ 941 #define MSA_SPLATI_B(RTYPE, in, idx, out) \ 942 { \ 943 out = (RTYPE) __msa_splati_b((v16i8) in, idx); \ 944 } 945 946 #define MSA_SPLATI_B2(RTYPE, in, idx0, idx1, out0, out1) \ 947 { \ 948 MSA_SPLATI_B(RTYPE, in, idx0, out0) \ 949 MSA_SPLATI_B(RTYPE, in, idx1, out1) \ 950 } 951 952 #define MSA_SPLATI_B4(RTYPE, in, idx0, idx1, idx2, idx3, \ 953 out0, out1, out2, out3) \ 954 { \ 955 MSA_SPLATI_B2(RTYPE, in, idx0, idx1, out0, out1) \ 956 MSA_SPLATI_B2(RTYPE, in, idx2, idx3, out2, out3) \ 957 } 958 959 /* Description : Indexed halfword elements are replicated to all elements in 960 * output vector. 961 * Arguments : Inputs - in, idx 962 * Outputs - out 963 * Return Type - as per RTYPE 964 * Details : 'idx' element value from 'in' vector is replicated to all 965 * elements in 'out' vector. 966 * Valid index range for halfword operation is 0-7. 967 */ 968 #define MSA_SPLATI_H(RTYPE, in, idx, out) \ 969 { \ 970 out = (RTYPE) __msa_splati_h((v8i16) in, idx); \ 971 } 972 973 #define MSA_SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) \ 974 { \ 975 MSA_SPLATI_H(RTYPE, in, idx0, out0) \ 976 MSA_SPLATI_H(RTYPE, in, idx1, out1) \ 977 } 978 979 #define MSA_SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, \ 980 out0, out1, out2, out3) \ 981 { \ 982 MSA_SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) \ 983 MSA_SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3) \ 984 } 985 986 /* Description : Indexed word elements are replicated to all elements in 987 * output vector. 988 * Arguments : Inputs - in, idx 989 * Outputs - out 990 * Return Type - as per RTYPE 991 * Details : 'idx' element value from 'in' vector is replicated to all 992 * elements in 'out' vector. 993 * Valid index range for halfword operation is 0-3. 
994 */ 995 #define MSA_SPLATI_W(RTYPE, in, idx, out) \ 996 { \ 997 out = (RTYPE) __msa_splati_w((v4i32) in, idx); \ 998 } 999 1000 #define MSA_SPLATI_W2(RTYPE, in, idx0, idx1, out0, out1) \ 1001 { \ 1002 MSA_SPLATI_W(RTYPE, in, idx0, out0) \ 1003 MSA_SPLATI_W(RTYPE, in, idx1, out1) \ 1004 } 1005 1006 #define MSA_SPLATI_W4(RTYPE, in, idx0, idx1, idx2, idx3, \ 1007 out0, out1, out2, out3) \ 1008 { \ 1009 MSA_SPLATI_W2(RTYPE, in, idx0, idx1, out0, out1) \ 1010 MSA_SPLATI_W2(RTYPE, in, idx2, idx3, out2, out3) \ 1011 } 1012 1013 /* Description : Pack even byte elements of vector pairs. 1014 * Arguments : Inputs - in0, in1 1015 * Outputs - out 1016 * Return Type - as per RTYPE 1017 * Details : Even byte elements of 'in0' are copied to the left half of 1018 * 'out' & even byte elements of 'in1' are copied to the right 1019 * half of 'out'. 1020 */ 1021 #define MSA_PCKEV_B(RTYPE, in0, in1, out) \ 1022 { \ 1023 out = (RTYPE) __msa_pckev_b((v16i8) in0, (v16i8) in1); \ 1024 } 1025 1026 #define MSA_PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1027 { \ 1028 MSA_PCKEV_B(RTYPE, in0, in1, out0) \ 1029 MSA_PCKEV_B(RTYPE, in2, in3, out1) \ 1030 } 1031 1032 #define MSA_PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, \ 1033 in6, in7, out0, out1, out2, out3) \ 1034 { \ 1035 MSA_PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1036 MSA_PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3) \ 1037 } 1038 1039 /* Description : Pack even halfword elements of vector pairs. 1040 * Arguments : Inputs - in0, in1 1041 * Outputs - out 1042 * Return Type - as per RTYPE 1043 * Details : Even halfword elements of 'in0' are copied to the left half of 1044 * 'out' & even halfword elements of 'in1' are copied to the right 1045 * half of 'out'. 
1046 */ 1047 #define MSA_PCKEV_H(RTYPE, in0, in1, out) \ 1048 { \ 1049 out = (RTYPE) __msa_pckev_h((v8i16) in0, (v8i16) in1); \ 1050 } 1051 1052 #define MSA_PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1053 { \ 1054 MSA_PCKEV_H(RTYPE, in0, in1, out0) \ 1055 MSA_PCKEV_H(RTYPE, in2, in3, out1) \ 1056 } 1057 1058 #define MSA_PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, \ 1059 in6, in7, out0, out1, out2, out3) \ 1060 { \ 1061 MSA_PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1062 MSA_PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3) \ 1063 } 1064 1065 /* Description : Pack even word elements of vector pairs. 1066 * Arguments : Inputs - in0, in1 1067 * Outputs - out 1068 * Return Type - as per RTYPE 1069 * Details : Even word elements of 'in0' are copied to the left half of 1070 * 'out' & even word elements of 'in1' are copied to the right 1071 * half of 'out'. 1072 */ 1073 #define MSA_PCKEV_W(RTYPE, in0, in1, out) \ 1074 { \ 1075 out = (RTYPE) __msa_pckev_w((v4i32) in0, (v4i32) in1); \ 1076 } 1077 1078 #define MSA_PCKEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1079 { \ 1080 MSA_PCKEV_W(RTYPE, in0, in1, out0) \ 1081 MSA_PCKEV_W(RTYPE, in2, in3, out1) \ 1082 } 1083 1084 #define MSA_PCKEV_W4(RTYPE, in0, in1, in2, in3, in4, in5, \ 1085 in6, in7, out0, out1, out2, out3) \ 1086 { \ 1087 MSA_PCKEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1088 MSA_PCKEV_W2(RTYPE, in4, in5, in6, in7, out2, out3) \ 1089 } 1090 1091 /* Description : Pack even double word elements of vector pairs. 1092 * Arguments : Inputs - in0, in1 1093 * Outputs - out 1094 * Return Type - as per RTYPE 1095 * Details : Even double word elements of 'in0' are copied to the left 1096 * half of 'out' & even double word elements of 'in1' are 1097 * copied to the right half of 'out'. 
1098 */ 1099 #define MSA_PCKEV_D(RTYPE, in0, in1, out) \ 1100 { \ 1101 out = (RTYPE) __msa_pckev_d((v2i64) in0, (v2i64) in1); \ 1102 } 1103 1104 #define MSA_PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1105 { \ 1106 MSA_PCKEV_D(RTYPE, in0, in1, out0) \ 1107 MSA_PCKEV_D(RTYPE, in2, in3, out1) \ 1108 } 1109 1110 #define MSA_PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, \ 1111 in6, in7, out0, out1, out2, out3) \ 1112 { \ 1113 MSA_PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1114 MSA_PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3) \ 1115 } 1116 1117 /* Description : Pack odd byte elements of vector pairs. 1118 * Arguments : Inputs - in0, in1 1119 * Outputs - out 1120 * Return Type - as per RTYPE 1121 * Details : Odd byte elements of 'in0' are copied to the left half of 1122 * 'out' & odd byte elements of 'in1' are copied to the right 1123 * half of 'out'. 1124 */ 1125 #define MSA_PCKOD_B(RTYPE, in0, in1, out) \ 1126 { \ 1127 out = (RTYPE) __msa_pckod_b((v16i8) in0, (v16i8) in1); \ 1128 } 1129 1130 #define MSA_PCKOD_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1131 { \ 1132 MSA_PCKOD_B(RTYPE, in0, in1, out0) \ 1133 MSA_PCKOD_B(RTYPE, in2, in3, out1) \ 1134 } 1135 1136 #define MSA_PCKOD_B4(RTYPE, in0, in1, in2, in3, in4, in5, \ 1137 in6, in7, out0, out1, out2, out3) \ 1138 { \ 1139 MSA_PCKOD_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1140 MSA_PCKOD_B2(RTYPE, in4, in5, in6, in7, out2, out3) \ 1141 } 1142 1143 /* Description : Pack odd halfword elements of vector pairs. 1144 * Arguments : Inputs - in0, in1 1145 * Outputs - out 1146 * Return Type - as per RTYPE 1147 * Details : Odd halfword elements of 'in0' are copied to the left half of 1148 * 'out' & odd halfword elements of 'in1' are copied to the right 1149 * half of 'out'. 
1150 */ 1151 #define MSA_PCKOD_H(RTYPE, in0, in1, out) \ 1152 { \ 1153 out = (RTYPE) __msa_pckod_h((v8i16) in0, (v8i16) in1); \ 1154 } 1155 1156 #define MSA_PCKOD_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1157 { \ 1158 MSA_PCKOD_H(RTYPE, in0, in1, out0) \ 1159 MSA_PCKOD_H(RTYPE, in2, in3, out1) \ 1160 } 1161 1162 #define MSA_PCKOD_H4(RTYPE, in0, in1, in2, in3, in4, in5, \ 1163 in6, in7, out0, out1, out2, out3) \ 1164 { \ 1165 MSA_PCKOD_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1166 MSA_PCKOD_H2(RTYPE, in4, in5, in6, in7, out2, out3) \ 1167 } 1168 1169 /* Description : Pack odd word elements of vector pairs. 1170 * Arguments : Inputs - in0, in1 1171 * Outputs - out 1172 * Return Type - as per RTYPE 1173 * Details : Odd word elements of 'in0' are copied to the left half of 1174 * 'out' & odd word elements of 'in1' are copied to the right 1175 * half of 'out'. 1176 */ 1177 #define MSA_PCKOD_W(RTYPE, in0, in1, out) \ 1178 { \ 1179 out = (RTYPE) __msa_pckod_w((v4i32) in0, (v4i32) in1); \ 1180 } 1181 1182 #define MSA_PCKOD_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1183 { \ 1184 MSA_PCKOD_W(RTYPE, in0, in1, out0) \ 1185 MSA_PCKOD_W(RTYPE, in2, in3, out1) \ 1186 } 1187 1188 #define MSA_PCKOD_W4(RTYPE, in0, in1, in2, in3, in4, in5, \ 1189 in6, in7, out0, out1, out2, out3) \ 1190 { \ 1191 MSA_PCKOD_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1192 MSA_PCKOD_W2(RTYPE, in4, in5, in6, in7, out2, out3) \ 1193 } 1194 1195 /* Description : Pack odd double word elements of vector pairs. 1196 * Arguments : Inputs - in0, in1 1197 * Outputs - out 1198 * Return Type - as per RTYPE 1199 * Details : Odd double word elements of 'in0' are copied to the left 1200 * half of 'out' & odd double word elements of 'in1' are 1201 * copied to the right half of 'out'. 
1202 */ 1203 #define MSA_PCKOD_D(RTYPE, in0, in1, out) \ 1204 { \ 1205 out = (RTYPE) __msa_pckod_d((v2i64) in0, (v2i64) in1); \ 1206 } 1207 1208 #define MSA_PCKOD_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1209 { \ 1210 MSA_PCKOD_D(RTYPE, in0, in1, out0) \ 1211 MSA_PCKOD_D(RTYPE, in2, in3, out1) \ 1212 } 1213 1214 #define MSA_PCKOD_D4(RTYPE, in0, in1, in2, in3, in4, in5, \ 1215 in6, in7, out0, out1, out2, out3) \ 1216 { \ 1217 MSA_PCKOD_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1218 MSA_PCKOD_D2(RTYPE, in4, in5, in6, in7, out2, out3) \ 1219 } 1220 1221 /* Description : Dot product of unsigned byte vector elements. 1222 * Arguments : Inputs - mult 1223 * cnst 1224 * Outputs - out 1225 * Return Type - as per RTYPE 1226 * Details : Unsigned byte elements from 'mult' are multiplied with 1227 * unsigned byte elements from 'cnst' producing a result 1228 * twice the size of input i.e. unsigned halfword. 1229 * Then this multiplication results of adjacent odd-even elements 1230 * are added together and stored to the out vector. 1231 */ 1232 #define MSA_DOTP_UB(RTYPE, mult, cnst, out) \ 1233 { \ 1234 out = (RTYPE) __msa_dotp_u_h((v16u8) mult, (v16u8) cnst); \ 1235 } 1236 1237 #define MSA_DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ 1238 { \ 1239 MSA_DOTP_UB(RTYPE, mult0, cnst0, out0) \ 1240 MSA_DOTP_UB(RTYPE, mult1, cnst1, out1) \ 1241 } 1242 1243 #define MSA_DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, \ 1244 cnst0, cnst1, cnst2, cnst3, \ 1245 out0, out1, out2, out3) \ 1246 { \ 1247 MSA_DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ 1248 MSA_DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ 1249 } 1250 1251 /* Description : Dot product of signed byte vector elements. 1252 * Arguments : Inputs - mult 1253 * cnst 1254 * Outputs - out 1255 * Return Type - as per RTYPE 1256 * Details : Signed byte elements from 'mult' are multiplied with 1257 * signed byte elements from 'cnst' producing a result 1258 * twice the size of input i.e. 
signed halfword. 1259 * Then this multiplication results of adjacent odd-even elements 1260 * are added together and stored to the out vector. 1261 */ 1262 #define MSA_DOTP_SB(RTYPE, mult, cnst, out) \ 1263 { \ 1264 out = (RTYPE) __msa_dotp_s_h((v16i8) mult, (v16i8) cnst); \ 1265 } 1266 1267 #define MSA_DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ 1268 { \ 1269 MSA_DOTP_SB(RTYPE, mult0, cnst0, out0) \ 1270 MSA_DOTP_SB(RTYPE, mult1, cnst1, out1) \ 1271 } 1272 1273 #define MSA_DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, \ 1274 cnst0, cnst1, cnst2, cnst3, \ 1275 out0, out1, out2, out3) \ 1276 { \ 1277 MSA_DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ 1278 MSA_DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ 1279 } 1280 1281 /* Description : Dot product of unsigned halfword vector elements. 1282 * Arguments : Inputs - mult 1283 * cnst 1284 * Outputs - out 1285 * Return Type - as per RTYPE 1286 * Details : Unsigned halfword elements from 'mult' are multiplied with 1287 * unsigned halfword elements from 'cnst' producing a result 1288 * twice the size of input i.e. unsigned word. 1289 * Then this multiplication results of adjacent odd-even elements 1290 * are added together and stored to the out vector. 1291 */ 1292 #define MSA_DOTP_UH(RTYPE, mult, cnst, out) \ 1293 { \ 1294 out = (RTYPE) __msa_dotp_u_w((v8u16) mult, (v8u16) cnst); \ 1295 } 1296 1297 #define MSA_DOTP_UH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ 1298 { \ 1299 MSA_DOTP_UH(RTYPE, mult0, cnst0, out0) \ 1300 MSA_DOTP_UH(RTYPE, mult1, cnst1, out1) \ 1301 } 1302 1303 #define MSA_DOTP_UH4(RTYPE, mult0, mult1, mult2, mult3, \ 1304 cnst0, cnst1, cnst2, cnst3, \ 1305 out0, out1, out2, out3) \ 1306 { \ 1307 MSA_DOTP_UH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ 1308 MSA_DOTP_UH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ 1309 } 1310 1311 /* Description : Dot product of signed halfword vector elements. 
1312 * Arguments : Inputs - mult 1313 * cnst 1314 * Outputs - out 1315 * Return Type - as per RTYPE 1316 * Details : Signed halfword elements from 'mult' are multiplied with 1317 * signed halfword elements from 'cnst' producing a result 1318 * twice the size of input i.e. signed word. 1319 * Then this multiplication results of adjacent odd-even elements 1320 * are added together and stored to the out vector. 1321 */ 1322 #define MSA_DOTP_SH(RTYPE, mult, cnst, out) \ 1323 { \ 1324 out = (RTYPE) __msa_dotp_s_w((v8i16) mult, (v8i16) cnst); \ 1325 } 1326 1327 #define MSA_DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ 1328 { \ 1329 MSA_DOTP_SH(RTYPE, mult0, cnst0, out0) \ 1330 MSA_DOTP_SH(RTYPE, mult1, cnst1, out1) \ 1331 } 1332 1333 #define MSA_DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, \ 1334 cnst0, cnst1, cnst2, cnst3, \ 1335 out0, out1, out2, out3) \ 1336 { \ 1337 MSA_DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ 1338 MSA_DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ 1339 } 1340 1341 /* Description : Dot product & addition of unsigned byte vector elements. 1342 * Arguments : Inputs - mult 1343 * cnst 1344 * Outputs - out 1345 * Return Type - as per RTYPE 1346 * Details : Unsigned byte elements from 'mult' are multiplied with 1347 * unsigned byte elements from 'cnst' producing a result 1348 * twice the size of input i.e. unsigned halfword. 1349 * Then this multiplication results of adjacent odd-even elements 1350 * are added to the out vector. 
1351 */ 1352 #define MSA_DPADD_UB(RTYPE, mult, cnst, out) \ 1353 { \ 1354 out = (RTYPE) __msa_dpadd_u_h((v8u16) out, \ 1355 (v16u8) mult, (v16u8) cnst); \ 1356 } 1357 1358 #define MSA_DPADD_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ 1359 { \ 1360 MSA_DPADD_UB(RTYPE, mult0, cnst0, out0) \ 1361 MSA_DPADD_UB(RTYPE, mult1, cnst1, out1) \ 1362 } 1363 1364 #define MSA_DPADD_UB4(RTYPE, mult0, mult1, mult2, mult3, \ 1365 cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) \ 1366 { \ 1367 MSA_DPADD_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ 1368 MSA_DPADD_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ 1369 } 1370 1371 /* Description : Dot product & addition of signed byte vector elements. 1372 * Arguments : Inputs - mult 1373 * cnst 1374 * Outputs - out 1375 * Return Type - as per RTYPE 1376 * Details : Signed byte elements from 'mult' are multiplied with 1377 * signed byte elements from 'cnst' producing a result 1378 * twice the size of input i.e. signed halfword. 1379 * Then this multiplication results of adjacent odd-even elements 1380 * are added to the out vector. 1381 */ 1382 #define MSA_DPADD_SB(RTYPE, mult, cnst, out) \ 1383 { \ 1384 out = (RTYPE) __msa_dpadd_s_h((v8i16) out, \ 1385 (v16i8) mult, (v16i8) cnst); \ 1386 } 1387 1388 #define MSA_DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ 1389 { \ 1390 MSA_DPADD_SB(RTYPE, mult0, cnst0, out0) \ 1391 MSA_DPADD_SB(RTYPE, mult1, cnst1, out1) \ 1392 } 1393 1394 #define MSA_DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, \ 1395 cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) \ 1396 { \ 1397 MSA_DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ 1398 MSA_DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ 1399 } 1400 1401 /* Description : Dot product & addition of unsigned halfword vector elements. 
1402 * Arguments : Inputs - mult 1403 * cnst 1404 * Outputs - out 1405 * Return Type - as per RTYPE 1406 * Details : Unsigned halfword elements from 'mult' are multiplied with 1407 * unsigned halfword elements from 'cnst' producing a result 1408 * twice the size of input i.e. unsigned word. 1409 * Then this multiplication results of adjacent odd-even elements 1410 * are added to the out vector. 1411 */ 1412 #define MSA_DPADD_UH(RTYPE, mult, cnst, out) \ 1413 { \ 1414 out = (RTYPE) __msa_dpadd_u_w((v4u32) out, \ 1415 (v8u16) mult, (v8u16) cnst); \ 1416 } 1417 1418 #define MSA_DPADD_UH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ 1419 { \ 1420 MSA_DPADD_UH(RTYPE, mult0, cnst0, out0) \ 1421 MSA_DPADD_UH(RTYPE, mult1, cnst1, out1) \ 1422 } 1423 1424 #define MSA_DPADD_UH4(RTYPE, mult0, mult1, mult2, mult3, \ 1425 cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) \ 1426 { \ 1427 MSA_DPADD_UH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ 1428 MSA_DPADD_UH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ 1429 } 1430 1431 /* Description : Dot product & addition of signed halfword vector elements. 1432 * Arguments : Inputs - mult 1433 * cnst 1434 * Outputs - out 1435 * Return Type - as per RTYPE 1436 * Details : Signed halfword elements from 'mult' are multiplied with 1437 * signed halfword elements from 'cnst' producing a result 1438 * twice the size of input i.e. signed word. 1439 * Then this multiplication results of adjacent odd-even elements 1440 * are added to the out vector. 
1441 */ 1442 #define MSA_DPADD_SH(RTYPE, mult, cnst, out) \ 1443 { \ 1444 out = (RTYPE) __msa_dpadd_s_w((v4i32) out, \ 1445 (v8i16) mult, (v8i16) cnst); \ 1446 } 1447 1448 #define MSA_DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ 1449 { \ 1450 MSA_DPADD_SH(RTYPE, mult0, cnst0, out0) \ 1451 MSA_DPADD_SH(RTYPE, mult1, cnst1, out1) \ 1452 } 1453 1454 #define MSA_DPADD_SH4(RTYPE, mult0, mult1, mult2, mult3, \ 1455 cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) \ 1456 { \ 1457 MSA_DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ 1458 MSA_DPADD_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ 1459 } 1460 1461 /* Description : Clip all signed halfword elements of input vector between min & max. 1462 * out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in)). 1463 * Arguments : Inputs - in (input vector) 1464 * - min (min threshold) 1465 * - max (max threshold) 1466 * Outputs - in (output vector with clipped elements) 1467 * Note : type of 'in' must be v8i16. 1468 */ 1469 #define MSA_CLIP_SH(in, min, max) \ 1470 { \ 1471 in = __msa_max_s_h((v8i16) min, (v8i16) in); \ 1472 in = __msa_min_s_h((v8i16) max, (v8i16) in); \ 1473 } 1474 1475 /* Description : Clip all signed halfword elements of input vector between 0 & 255. 1476 * Arguments : Inputs - in (input vector) 1477 * Outputs - in (output vector with clipped elements) 1478 * Note : type of 'in' must be v8i16. 
1479 */ 1480 #define MSA_CLIP_SH_0_255(in) \ 1481 { \ 1482 in = __msa_maxi_s_h((v8i16) in, 0); \ 1483 in = (v8i16) __msa_sat_u_h((v8u16) in, 7); \ 1484 } 1485 1486 #define MSA_CLIP_SH2_0_255(in0, in1) \ 1487 { \ 1488 MSA_CLIP_SH_0_255(in0); \ 1489 MSA_CLIP_SH_0_255(in1); \ 1490 } 1491 1492 #define MSA_CLIP_SH4_0_255(in0, in1, in2, in3) \ 1493 { \ 1494 MSA_CLIP_SH2_0_255(in0, in1); \ 1495 MSA_CLIP_SH2_0_255(in2, in3); \ 1496 } 1497 1498 #define MSA_CLIP_SH8_0_255(in0, in1, in2, in3, \ 1499 in4, in5, in6, in7) \ 1500 { \ 1501 MSA_CLIP_SH4_0_255(in0, in1, in2, in3); \ 1502 MSA_CLIP_SH4_0_255(in4, in5, in6, in7); \ 1503 } 1504 1505 /* Description : Clip all signed word elements of input vector between 0 & 255. 1506 * Arguments : Inputs - in (input vector) 1507 * Outputs - in (output vector with clipped elements) 1508 * Note : type of 'in' must be v4i32. 1509 */ 1510 #define MSA_CLIP_SW_0_255(in) \ 1511 { \ 1512 in = __msa_maxi_s_w((v4i32) in, 0); \ 1513 in = (v4i32) __msa_sat_u_w((v4u32) in, 7); \ 1514 } 1515 1516 #define MSA_CLIP_SW2_0_255(in0, in1) \ 1517 { \ 1518 MSA_CLIP_SW_0_255(in0); \ 1519 MSA_CLIP_SW_0_255(in1); \ 1520 } 1521 1522 #define MSA_CLIP_SW4_0_255(in0, in1, in2, in3) \ 1523 { \ 1524 MSA_CLIP_SW2_0_255(in0, in1); \ 1525 MSA_CLIP_SW2_0_255(in2, in3); \ 1526 } 1527 1528 #define MSA_CLIP_SW8_0_255(in0, in1, in2, in3, \ 1529 in4, in5, in6, in7) \ 1530 { \ 1531 MSA_CLIP_SW4_0_255(in0, in1, in2, in3); \ 1532 MSA_CLIP_SW4_0_255(in4, in5, in6, in7); \ 1533 } 1534 1535 /* Description : Addition of 16 unsigned byte elements. 1536 * 16 unsigned byte elements of input vector are added 1537 * together and resulted integer sum is returned. 
1538 * Arguments : Inputs - in (unsigned byte vector) 1539 * Outputs - sum_m (u32 sum) 1540 * Return Type - unsigned word 1541 */ 1542 #define MSA_HADD_UB_U32(in, sum_m) \ 1543 { \ 1544 v8u16 res_m; \ 1545 v4u32 res0_m; \ 1546 v2u64 res1_m, res2_m; \ 1547 \ 1548 res_m = __msa_hadd_u_h((v16u8) in, (v16u8) in); \ 1549 res0_m = __msa_hadd_u_w(res_m, res_m); \ 1550 res1_m = __msa_hadd_u_d(res0_m, res0_m); \ 1551 res2_m = (v2u64) __msa_splati_d((v2i64) res1_m, 1); \ 1552 res1_m += res2_m; \ 1553 sum_m = __msa_copy_u_w((v4i32) res1_m, 0); \ 1554 } 1555 1556 /* Description : Addition of 8 unsigned halfword elements. 1557 * 8 unsigned halfword elements of input vector are added 1558 * together and resulted integer sum is returned. 1559 * Arguments : Inputs - in (unsigned halfword vector) 1560 * Outputs - sum_m (u32 sum) 1561 * Return Type - unsigned word 1562 */ 1563 #define MSA_HADD_UH_U32(in, sum_m) \ 1564 { \ 1565 v4u32 res_m; \ 1566 v2u64 res0_m, res1_m; \ 1567 \ 1568 res_m = __msa_hadd_u_w((v8u16) in, (v8u16) in); \ 1569 res0_m = __msa_hadd_u_d(res_m, res_m); \ 1570 res1_m = (v2u64) __msa_splati_d((v2i64) res0_m, 1); \ 1571 res0_m += res1_m; \ 1572 sum_m = __msa_copy_u_w((v4i32) res0_m, 0); \ 1573 } 1574 1575 /* Description : Addition of 4 unsigned word elements. 1576 * 4 unsigned word elements of input vector are added together and 1577 * resulted integer sum is returned. 1578 * Arguments : Inputs - in (unsigned word vector) 1579 * Outputs - sum_m (u32 sum) 1580 * Return Type - unsigned word 1581 */ 1582 #define MSA_HADD_UW_U32(in, sum_m) \ 1583 { \ 1584 v2u64 res0_m, res1_m; \ 1585 \ 1586 res0_m = __msa_hadd_u_d((v4u32) in, (v4u32) in); \ 1587 res1_m = (v2u64) __msa_splati_d((v2i64) res0_m, 1); \ 1588 res0_m += res1_m; \ 1589 sum_m = __msa_copy_u_w((v4i32) res0_m, 0); \ 1590 } 1591 1592 /* Description : Addition of 16 signed byte elements. 1593 * 16 signed byte elements of input vector are added 1594 * together and resulted integer sum is returned. 
1595 * Arguments : Inputs - in (signed byte vector) 1596 * Outputs - sum_m (i32 sum) 1597 * Return Type - signed word 1598 */ 1599 #define MSA_HADD_SB_S32(in, sum_m) \ 1600 { \ 1601 v8i16 res_m; \ 1602 v4i32 res0_m; \ 1603 v2i64 res1_m, res2_m; \ 1604 \ 1605 res_m = __msa_hadd_s_h((v16i8) in, (v16i8) in); \ 1606 res0_m = __msa_hadd_s_w(res_m, res_m); \ 1607 res1_m = __msa_hadd_s_d(res0_m, res0_m); \ 1608 res2_m = __msa_splati_d(res1_m, 1); \ 1609 res1_m += res2_m; \ 1610 sum_m = __msa_copy_s_w((v4i32) res1_m, 0); \ 1611 } 1612 1613 /* Description : Addition of 8 signed halfword elements. 1614 * 8 signed halfword elements of input vector are added 1615 * together and resulted integer sum is returned. 1616 * Arguments : Inputs - in (signed halfword vector) 1617 * Outputs - sum_m (i32 sum) 1618 * Return Type - signed word 1619 */ 1620 #define MSA_HADD_SH_S32(in, sum_m) \ 1621 { \ 1622 v4i32 res_m; \ 1623 v2i64 res0_m, res1_m; \ 1624 \ 1625 res_m = __msa_hadd_s_w((v8i16) in, (v8i16) in); \ 1626 res0_m = __msa_hadd_s_d(res_m, res_m); \ 1627 res1_m = __msa_splati_d(res0_m, 1); \ 1628 res0_m += res1_m; \ 1629 sum_m = __msa_copy_s_w((v4i32) res0_m, 0); \ 1630 } 1631 1632 /* Description : Addition of 4 signed word elements. 1633 * 4 signed word elements of input vector are added together and 1634 * resulted integer sum is returned. 1635 * Arguments : Inputs - in (signed word vector) 1636 * Outputs - sum_m (i32 sum) 1637 * Return Type - signed word 1638 */ 1639 #define MSA_HADD_SW_S32(in, sum_m) \ 1640 { \ 1641 v2i64 res0_m, res1_m; \ 1642 \ 1643 res0_m = __msa_hadd_s_d((v4i32) in, (v4i32) in); \ 1644 res1_m = __msa_splati_d(res0_m, 1); \ 1645 res0_m += res1_m; \ 1646 sum_m = __msa_copy_s_w((v4i32) res0_m, 0); \ 1647 } 1648 1649 /* Description : Saturate the unsigned halfword element values to the max 1650 * unsigned value of (sat_val+1 bits). 1651 * The element data width remains unchanged. 
1652 * Arguments : Inputs - in, sat_val 1653 * Outputs - in (in place) 1654 * Return Type - v8u16 1655 * Details : Each unsigned halfword element from 'in' is saturated to the 1656 * value generated with (sat_val+1) bit range. 1657 * Results are in placed to original vectors. 1658 */ 1659 #define MSA_SAT_UH(in, sat_val) \ 1660 { \ 1661 in = __msa_sat_u_h(in, sat_val); \ 1662 } 1663 1664 #define MSA_SAT_UH2(in0, in1, sat_val) \ 1665 { \ 1666 MSA_SAT_UH(in0, sat_val) \ 1667 MSA_SAT_UH(in1, sat_val) \ 1668 } 1669 1670 #define MSA_SAT_UH4(in0, in1, in2, in3, sat_val) \ 1671 { \ 1672 MSA_SAT_UH2(in0, in1, sat_val) \ 1673 MSA_SAT_UH2(in2, in3, sat_val) \ 1674 } 1675 1676 /* Description : Saturate the signed halfword element values to the max 1677 * signed value of (sat_val+1 bits). 1678 * The element data width remains unchanged. 1679 * Arguments : Inputs - in, sat_val 1680 * Outputs - in (in place) 1681 * Return Type - v8i16 1682 * Details : Each signed halfword element from 'in' is saturated to the 1683 * value generated with (sat_val+1) bit range. 1684 * Results are in placed to original vectors. 1685 */ 1686 #define MSA_SAT_SH(in, sat_val) \ 1687 { \ 1688 in = __msa_sat_s_h(in, sat_val); \ 1689 } 1690 1691 #define MSA_SAT_SH2(in0, in1, sat_val) \ 1692 { \ 1693 MSA_SAT_SH(in0, sat_val) \ 1694 MSA_SAT_SH(in1, sat_val) \ 1695 } 1696 1697 #define MSA_SAT_SH4(in0, in1, in2, in3, sat_val) \ 1698 { \ 1699 MSA_SAT_SH2(in0, in1, sat_val) \ 1700 MSA_SAT_SH2(in2, in3, sat_val) \ 1701 } 1702 1703 /* Description : Saturate the unsigned word element values to the max 1704 * unsigned value of (sat_val+1 bits). 1705 * The element data width remains unchanged. 1706 * Arguments : Inputs - in, sat_val 1707 * Outputs - in (in place) 1708 * Return Type - v4u32 1709 * Details : Each unsigned word element from 'in' is saturated to the 1710 * value generated with (sat_val+1) bit range. 1711 * Results are in placed to original vectors. 
1712 */ 1713 #define MSA_SAT_UW(in, sat_val) \ 1714 { \ 1715 in = __msa_sat_u_w(in, sat_val); \ 1716 } 1717 1718 #define MSA_SAT_UW2(in0, in1, sat_val) \ 1719 { \ 1720 MSA_SAT_UW(in0, sat_val) \ 1721 MSA_SAT_UW(in1, sat_val) \ 1722 } 1723 1724 #define MSA_SAT_UW4(in0, in1, in2, in3, sat_val) \ 1725 { \ 1726 MSA_SAT_UW2(in0, in1, sat_val) \ 1727 MSA_SAT_UW2(in2, in3, sat_val) \ 1728 } 1729 1730 /* Description : Saturate the signed word element values to the max 1731 * signed value of (sat_val+1 bits). 1732 * The element data width remains unchanged. 1733 * Arguments : Inputs - in, sat_val 1734 * Outputs - in (in place) 1735 * Return Type - v4i32 1736 * Details : Each signed word element from 'in' is saturated to the 1737 * value generated with (sat_val+1) bit range. 1738 * Results are in placed to original vectors. 1739 */ 1740 #define MSA_SAT_SW(in, sat_val) \ 1741 { \ 1742 in = __msa_sat_s_w(in, sat_val); \ 1743 } 1744 1745 #define MSA_SAT_SW2(in0, in1, sat_val) \ 1746 { \ 1747 MSA_SAT_SW(in0, sat_val) \ 1748 MSA_SAT_SW(in1, sat_val) \ 1749 } 1750 1751 #define MSA_SAT_SW4(in0, in1, in2, in3, sat_val) \ 1752 { \ 1753 MSA_SAT_SW2(in0, in1, sat_val) \ 1754 MSA_SAT_SW2(in2, in3, sat_val) \ 1755 } 1756 1757 /* Description : Each byte element is logically xor'ed with immediate 128. 1758 * Arguments : Inputs - in 1759 * Outputs - in (in-place) 1760 * Return Type - as per RTYPE 1761 * Details : Each unsigned byte element from input vector 'in' is 1762 * logically xor'ed with 128 and result is in-place stored in 1763 * 'in' vector. 
1764 */ 1765 #define MSA_XORI_B_128(RTYPE, in) \ 1766 { \ 1767 in = (RTYPE) __msa_xori_b((v16u8) in, 128); \ 1768 } 1769 1770 #define MSA_XORI_B2_128(RTYPE, in0, in1) \ 1771 { \ 1772 MSA_XORI_B_128(RTYPE, in0); \ 1773 MSA_XORI_B_128(RTYPE, in1); \ 1774 } 1775 1776 #define MSA_XORI_B4_128(RTYPE, in0, in1, in2, in3) \ 1777 { \ 1778 MSA_XORI_B2_128(RTYPE, in0, in1); \ 1779 MSA_XORI_B2_128(RTYPE, in2, in3); \ 1780 } 1781 1782 /* Description : Shift right logical all byte elements of vector. 1783 * Arguments : Inputs - in, shift 1784 * Outputs - in (in place) 1785 * Return Type - as per RTYPE 1786 * Details : Each element of vector 'in' is shifted right logical by 1787 * number of bits respective element holds in vector 'shift' and 1788 * result is in place written to 'in'. 1789 * Here, 'shift' is a vector passed in. 1790 */ 1791 #define MSA_SRL_B(RTYPE, in, shift) \ 1792 { \ 1793 in = (RTYPE) __msa_srl_b((v16i8) in, (v16i8) shift); \ 1794 } 1795 1796 #define MSA_SRL_B2(RTYPE, in0, in1, shift) \ 1797 { \ 1798 MSA_SRL_B(RTYPE, in0, shift); \ 1799 MSA_SRL_B(RTYPE, in1, shift); \ 1800 } 1801 1802 #define MSA_SRL_B4(RTYPE, in0, in1, in2, in3, shift) \ 1803 { \ 1804 MSA_SRL_B2(RTYPE, in0, in1, shift); \ 1805 MSA_SRL_B2(RTYPE, in2, in3, shift); \ 1806 } 1807 1808 /* Description : Shift right logical all halfword elements of vector. 1809 * Arguments : Inputs - in, shift 1810 * Outputs - in (in place) 1811 * Return Type - as per RTYPE 1812 * Details : Each element of vector 'in' is shifted right logical by 1813 * number of bits respective element holds in vector 'shift' and 1814 * result is in place written to 'in'. 1815 * Here, 'shift' is a vector passed in. 
1816 */ 1817 #define MSA_SRL_H(RTYPE, in, shift) \ 1818 { \ 1819 in = (RTYPE) __msa_srl_h((v8i16) in, (v8i16) shift); \ 1820 } 1821 1822 #define MSA_SRL_H2(RTYPE, in0, in1, shift) \ 1823 { \ 1824 MSA_SRL_H(RTYPE, in0, shift); \ 1825 MSA_SRL_H(RTYPE, in1, shift); \ 1826 } 1827 1828 #define MSA_SRL_H4(RTYPE, in0, in1, in2, in3, shift) \ 1829 { \ 1830 MSA_SRL_H2(RTYPE, in0, in1, shift); \ 1831 MSA_SRL_H2(RTYPE, in2, in3, shift); \ 1832 } 1833 1834 /* Description : Shift right logical all word elements of vector. 1835 * Arguments : Inputs - in, shift 1836 * Outputs - in (in place) 1837 * Return Type - as per RTYPE 1838 * Details : Each element of vector 'in' is shifted right logical by 1839 * number of bits respective element holds in vector 'shift' and 1840 * result is in place written to 'in'. 1841 * Here, 'shift' is a vector passed in. 1842 */ 1843 #define MSA_SRL_W(RTYPE, in, shift) \ 1844 { \ 1845 in = (RTYPE) __msa_srl_w((v4i32) in, (v4i32) shift); \ 1846 } 1847 1848 #define MSA_SRL_W2(RTYPE, in0, in1, shift) \ 1849 { \ 1850 MSA_SRL_W(RTYPE, in0, shift); \ 1851 MSA_SRL_W(RTYPE, in1, shift); \ 1852 } 1853 1854 #define MSA_SRL_W4(RTYPE, in0, in1, in2, in3, shift) \ 1855 { \ 1856 MSA_SRL_W2(RTYPE, in0, in1, shift); \ 1857 MSA_SRL_W2(RTYPE, in2, in3, shift); \ 1858 } 1859 1860 /* Description : Shift right logical all double word elements of vector. 1861 * Arguments : Inputs - in, shift 1862 * Outputs - in (in place) 1863 * Return Type - as per RTYPE 1864 * Details : Each element of vector 'in' is shifted right logical by 1865 * number of bits respective element holds in vector 'shift' and 1866 * result is in place written to 'in'. 1867 * Here, 'shift' is a vector passed in. 
1868 */ 1869 #define MSA_SRL_D(RTYPE, in, shift) \ 1870 { \ 1871 in = (RTYPE) __msa_srl_d((v2i64) in, (v2i64) shift); \ 1872 } 1873 1874 #define MSA_SRL_D2(RTYPE, in0, in1, shift) \ 1875 { \ 1876 MSA_SRL_D(RTYPE, in0, shift); \ 1877 MSA_SRL_D(RTYPE, in1, shift); \ 1878 } 1879 1880 #define MSA_SRL_D4(RTYPE, in0, in1, in2, in3, shift) \ 1881 { \ 1882 MSA_SRL_D2(RTYPE, in0, in1, shift); \ 1883 MSA_SRL_D2(RTYPE, in2, in3, shift); \ 1884 } 1885 1886 /* Description : Shift right logical rounded all byte elements of vector. 1887 * Arguments : Inputs - in, shift 1888 * Outputs - in (in place) 1889 * Return Type - as per RTYPE 1890 * Details : Each element of vector 'in' is shifted right logical rounded 1891 * by number of bits respective element holds in vector 'shift' 1892 * and result is in place written to 'in'. 1893 * Here, 'shift' is a vector passed in. 1894 */ 1895 #define MSA_SRLR_B(RTYPE, in, shift) \ 1896 { \ 1897 in = (RTYPE) __msa_srlr_b((v16i8) in, (v16i8) shift); \ 1898 } 1899 1900 #define MSA_SRLR_B2(RTYPE, in0, in1, shift) \ 1901 { \ 1902 MSA_SRLR_B(RTYPE, in0, shift); \ 1903 MSA_SRLR_B(RTYPE, in1, shift); \ 1904 } 1905 1906 #define MSA_SRLR_B4(RTYPE, in0, in1, in2, in3, shift) \ 1907 { \ 1908 MSA_SRLR_B2(RTYPE, in0, in1, shift); \ 1909 MSA_SRLR_B2(RTYPE, in2, in3, shift); \ 1910 } 1911 1912 /* Description : Shift right logical rounded all halfword elements of vector. 1913 * Arguments : Inputs - in, shift 1914 * Outputs - in (in place) 1915 * Return Type - as per RTYPE 1916 * Details : Each element of vector 'in' is shifted right logical rounded 1917 * by number of bits respective element holds in vector 'shift' 1918 * and result is in place written to 'in'. 1919 * Here, 'shift' is a vector passed in. 
1920 */ 1921 #define MSA_SRLR_H(RTYPE, in, shift) \ 1922 { \ 1923 in = (RTYPE) __msa_srlr_h((v8i16) in, (v8i16) shift); \ 1924 } 1925 1926 #define MSA_SRLR_H2(RTYPE, in0, in1, shift) \ 1927 { \ 1928 MSA_SRLR_H(RTYPE, in0, shift); \ 1929 MSA_SRLR_H(RTYPE, in1, shift); \ 1930 } 1931 1932 #define MSA_SRLR_H4(RTYPE, in0, in1, in2, in3, shift) \ 1933 { \ 1934 MSA_SRLR_H2(RTYPE, in0, in1, shift); \ 1935 MSA_SRLR_H2(RTYPE, in2, in3, shift); \ 1936 } 1937 1938 /* Description : Shift right logical rounded all word elements of vector. 1939 * Arguments : Inputs - in, shift 1940 * Outputs - in (in place) 1941 * Return Type - as per RTYPE 1942 * Details : Each element of vector 'in' is shifted right logical rounded 1943 * by number of bits respective element holds in vector 'shift' 1944 * and result is in place written to 'in'. 1945 * Here, 'shift' is a vector passed in. 1946 */ 1947 #define MSA_SRLR_W(RTYPE, in, shift) \ 1948 { \ 1949 in = (RTYPE) __msa_srlr_w((v4i32) in, (v4i32) shift); \ 1950 } 1951 1952 #define MSA_SRLR_W2(RTYPE, in0, in1, shift) \ 1953 { \ 1954 MSA_SRLR_W(RTYPE, in0, shift); \ 1955 MSA_SRLR_W(RTYPE, in1, shift); \ 1956 } 1957 1958 #define MSA_SRLR_W4(RTYPE, in0, in1, in2, in3, shift) \ 1959 { \ 1960 MSA_SRLR_W2(RTYPE, in0, in1, shift); \ 1961 MSA_SRLR_W2(RTYPE, in2, in3, shift); \ 1962 } 1963 1964 /* Description : Shift right logical rounded all double word elements of vector. 1965 * Arguments : Inputs - in, shift 1966 * Outputs - in (in place) 1967 * Return Type - as per RTYPE 1968 * Details : Each element of vector 'in' is shifted right logical rounded 1969 * by number of bits respective element holds in vector 'shift' 1970 * and result is in place written to 'in'. 1971 * Here, 'shift' is a vector passed in. 
1972 */ 1973 #define MSA_SRLR_D(RTYPE, in, shift) \ 1974 { \ 1975 in = (RTYPE) __msa_srlr_d((v2i64) in, (v2i64) shift); \ 1976 } 1977 1978 #define MSA_SRLR_D2(RTYPE, in0, in1, shift) \ 1979 { \ 1980 MSA_SRLR_D(RTYPE, in0, shift); \ 1981 MSA_SRLR_D(RTYPE, in1, shift); \ 1982 } 1983 1984 #define MSA_SRLR_D4(RTYPE, in0, in1, in2, in3, shift) \ 1985 { \ 1986 MSA_SRLR_D2(RTYPE, in0, in1, shift); \ 1987 MSA_SRLR_D2(RTYPE, in2, in3, shift); \ 1988 } 1989 1990 /* Description : Shift right arithmetic rounded all byte elements of vector. 1991 * Arguments : Inputs - in, shift 1992 * Outputs - in (in place) 1993 * Return Type - as per RTYPE 1994 * Details : Each element of vector 'in' is shifted right arithmetic 1995 * rounded by number of bits respective element holds in 1996 * vector 'shift' and result is in place written to 'in'. 1997 * Here, 'shift' is a vector passed in. 1998 */ 1999 #define MSA_SRAR_B(RTYPE, in, shift) \ 2000 { \ 2001 in = (RTYPE) __msa_srar_b((v16i8) in, (v16i8) shift); \ 2002 } 2003 2004 #define MSA_SRAR_B2(RTYPE, in0, in1, shift) \ 2005 { \ 2006 MSA_SRAR_B(RTYPE, in0, shift); \ 2007 MSA_SRAR_B(RTYPE, in1, shift); \ 2008 } 2009 2010 #define MSA_SRAR_B4(RTYPE, in0, in1, in2, in3, shift) \ 2011 { \ 2012 MSA_SRAR_B2(RTYPE, in0, in1, shift); \ 2013 MSA_SRAR_B2(RTYPE, in2, in3, shift); \ 2014 } 2015 2016 /* Description : Shift right arithmetic rounded all halfword elements of vector. 2017 * Arguments : Inputs - in, shift 2018 * Outputs - in (in place) 2019 * Return Type - as per RTYPE 2020 * Details : Each element of vector 'in' is shifted right arithmetic 2021 * rounded by number of bits respective element holds in 2022 * vector 'shift' and result is in place written to 'in'. 2023 * Here, 'shift' is a vector passed in. 
2024 */ 2025 #define MSA_SRAR_H(RTYPE, in, shift) \ 2026 { \ 2027 in = (RTYPE) __msa_srar_h((v8i16) in, (v8i16) shift); \ 2028 } 2029 2030 #define MSA_SRAR_H2(RTYPE, in0, in1, shift) \ 2031 { \ 2032 MSA_SRAR_H(RTYPE, in0, shift); \ 2033 MSA_SRAR_H(RTYPE, in1, shift); \ 2034 } 2035 2036 #define MSA_SRAR_H4(RTYPE, in0, in1, in2, in3, shift) \ 2037 { \ 2038 MSA_SRAR_H2(RTYPE, in0, in1, shift); \ 2039 MSA_SRAR_H2(RTYPE, in2, in3, shift); \ 2040 } 2041 2042 /* Description : Shift right arithmetic rounded all word elements of vector. 2043 * Arguments : Inputs - in, shift 2044 * Outputs - in (in place) 2045 * Return Type - as per RTYPE 2046 * Details : Each element of vector 'in' is shifted right arithmetic 2047 * rounded by number of bits respective element holds in 2048 * vector 'shift' and result is in place written to 'in'. 2049 * Here, 'shift' is a vector passed in. 2050 */ 2051 #define MSA_SRAR_W(RTYPE, in, shift) \ 2052 { \ 2053 in = (RTYPE) __msa_srar_w((v4i32) in, (v4i32) shift); \ 2054 } 2055 2056 #define MSA_SRAR_W2(RTYPE, in0, in1, shift) \ 2057 { \ 2058 MSA_SRAR_W(RTYPE, in0, shift); \ 2059 MSA_SRAR_W(RTYPE, in1, shift); \ 2060 } 2061 2062 #define MSA_SRAR_W4(RTYPE, in0, in1, in2, in3, shift) \ 2063 { \ 2064 MSA_SRAR_W2(RTYPE, in0, in1, shift); \ 2065 MSA_SRAR_W2(RTYPE, in2, in3, shift); \ 2066 } 2067 2068 /* Description : Shift right arithmetic rounded all double word elements 2069 * of vector. 2070 * Arguments : Inputs - in, shift 2071 * Outputs - in (in place) 2072 * Return Type - as per RTYPE 2073 * Details : Each element of vector 'in' is shifted right arithmetic 2074 * rounded by number of bits respective element holds in 2075 * vector 'shift' and result is in place written to 'in'. 2076 * Here, 'shift' is a vector passed in. 
2077 */ 2078 #define MSA_SRAR_D(RTYPE, in, shift) \ 2079 { \ 2080 in = (RTYPE) __msa_srar_d((v2i64) in, (v2i64) shift); \ 2081 } 2082 2083 #define MSA_SRAR_D2(RTYPE, in0, in1, shift) \ 2084 { \ 2085 MSA_SRAR_D(RTYPE, in0, shift); \ 2086 MSA_SRAR_D(RTYPE, in1, shift); \ 2087 } 2088 2089 #define MSA_SRAR_D4(RTYPE, in0, in1, in2, in3, shift) \ 2090 { \ 2091 MSA_SRAR_D2(RTYPE, in0, in1, shift); \ 2092 MSA_SRAR_D2(RTYPE, in2, in3, shift); \ 2093 } 2094 2095 /* Description : Shift right arithmetic rounded all byte elements of vector. 2096 * Arguments : Inputs - in, shift 2097 * Outputs - in (in place) 2098 * Return Type - as per RTYPE 2099 * Details : Each element of vector 'in' is shifted right arithmetic 2100 * rounded by number of bits respective element holds in vector 2101 * 'shift' and result is in place written to 'in'. 2102 * Here, 'shift' is a immediate number passed in. 2103 */ 2104 #define MSA_SRARI_B(RTYPE, in, shift) \ 2105 { \ 2106 in = (RTYPE) __msa_srari_b((v16i8) in, (v16i8) shift); \ 2107 } 2108 2109 #define MSA_SRARI_B2(RTYPE, in0, in1, shift) \ 2110 { \ 2111 MSA_SRARI_B(RTYPE, in0, shift); \ 2112 MSA_SRARI_B(RTYPE, in1, shift); \ 2113 } 2114 2115 #define MSA_SRARI_B4(RTYPE, in0, in1, in2, in3, shift) \ 2116 { \ 2117 MSA_SRARI_B2(RTYPE, in0, in1, shift); \ 2118 MSA_SRARI_B2(RTYPE, in2, in3, shift); \ 2119 } 2120 2121 /* Description : Shift right arithmetic rounded all halfword elements of vector. 2122 * Arguments : Inputs - in, shift 2123 * Outputs - in (in place) 2124 * Return Type - as per RTYPE 2125 * Details : Each element of vector 'in' is shifted right arithmetic 2126 * rounded by number of bits respective element holds in vector 2127 * 'shift' and result is in place written to 'in'. 2128 * Here, 'shift' is a immediate number passed in. 
2129 */ 2130 #define MSA_SRARI_H(RTYPE, in, shift) \ 2131 { \ 2132 in = (RTYPE) __msa_srari_h((v8i16) in, (v8i16) shift); \ 2133 } 2134 2135 #define MSA_SRARI_H2(RTYPE, in0, in1, shift) \ 2136 { \ 2137 MSA_SRARI_H(RTYPE, in0, shift); \ 2138 MSA_SRARI_H(RTYPE, in1, shift); \ 2139 } 2140 2141 #define MSA_SRARI_H4(RTYPE, in0, in1, in2, in3, shift) \ 2142 { \ 2143 MSA_SRARI_H2(RTYPE, in0, in1, shift); \ 2144 MSA_SRARI_H2(RTYPE, in2, in3, shift); \ 2145 } 2146 2147 /* Description : Shift right arithmetic rounded all word elements of vector. 2148 * Arguments : Inputs - in, shift 2149 * Outputs - in (in place) 2150 * Return Type - as per RTYPE 2151 * Details : Each element of vector 'in' is shifted right arithmetic 2152 * rounded by number of bits respective element holds in vector 2153 * 'shift' and result is in place written to 'in'. 2154 * Here, 'shift' is a immediate number passed in. 2155 */ 2156 #define MSA_SRARI_W(RTYPE, in, shift) \ 2157 { \ 2158 in = (RTYPE) __msa_srari_w((v4i32) in, (v4i32) shift); \ 2159 } 2160 2161 #define MSA_SRARI_W2(RTYPE, in0, in1, shift) \ 2162 { \ 2163 MSA_SRARI_W(RTYPE, in0, shift); \ 2164 MSA_SRARI_W(RTYPE, in1, shift); \ 2165 } 2166 2167 #define MSA_SRARI_W4(RTYPE, in0, in1, in2, in3, shift) \ 2168 { \ 2169 MSA_SRARI_W2(RTYPE, in0, in1, shift); \ 2170 MSA_SRARI_W2(RTYPE, in2, in3, shift); \ 2171 } 2172 2173 /* Description : Shift right arithmetic rounded all double word elements 2174 * of vector. 2175 * Arguments : Inputs - in, shift 2176 * Outputs - in (in place) 2177 * Return Type - as per RTYPE 2178 * Details : Each element of vector 'in' is shifted right arithmetic 2179 * rounded by number of bits respective element holds in 2180 * vector 'shift' and result is in place written to 'in'. 2181 * Here, 'shift' is a immediate number passed in. 
2182 */ 2183 #define MSA_SRARI_D(RTYPE, in, shift) \ 2184 { \ 2185 in = (RTYPE) __msa_srari_d((v2i64) in, (v2i64) shift); \ 2186 } 2187 2188 #define MSA_SRARI_D2(RTYPE, in0, in1, shift) \ 2189 { \ 2190 MSA_SRARI_D(RTYPE, in0, shift); \ 2191 MSA_SRARI_D(RTYPE, in1, shift); \ 2192 } 2193 2194 #define MSA_SRARI_D4(RTYPE, in0, in1, in2, in3, shift) \ 2195 { \ 2196 MSA_SRARI_D2(RTYPE, in0, in1, shift); \ 2197 MSA_SRARI_D2(RTYPE, in2, in3, shift); \ 2198 } 2199 2200 /* Description : Transposes input 4x4 byte block. 2201 * Arguments : Inputs - in0, in1, in2, in3 (input 4x4 byte block) 2202 * Outputs - out0, out1, out2, out3 (output 4x4 byte block) 2203 * Return Type - RTYPE 2204 * Details : 2205 */ 2206 #define MSA_TRANSPOSE4x4_B(RTYPE, in0, in1, in2, in3, \ 2207 out0, out1, out2, out3) \ 2208 { \ 2209 v16i8 zero_m = { 0 }; \ 2210 \ 2211 MSA_ILVR_B2(RTYPE, in2, in0, in3, in1, out2, out3); \ 2212 out0 = (RTYPE) __msa_ilvr_b((v16i8) out3, (v16i8) out2); \ 2213 out1 = (RTYPE) __msa_sldi_b(zero_m, (v16i8) out0, 4); \ 2214 out2 = (RTYPE) __msa_sldi_b(zero_m, (v16i8) out1, 4); \ 2215 out3 = (RTYPE) __msa_sldi_b(zero_m, (v16i8) out2, 4); \ 2216 } 2217 2218 /* Description : Transposes input 8x4 byte block into 4x8. 
2219 * Arguments : Inputs - in0, in1, in2 ~ in7 (input 8x4 byte block) 2220 * Outputs - out0, out1, out2, out3 (output 4x8 byte block) 2221 * Return Type - RTYPE 2222 * Details : 2223 */ 2224 #define MSA_TRANSPOSE8x4_B(RTYPE, in0, in1, in2, in3, in4, in5, \ 2225 in6, in7, out0, out1, out2, out3) \ 2226 { \ 2227 v16i8 zero_m = { 0 }; \ 2228 \ 2229 MSA_ILVR_B4(RTYPE, in2, in0, in3, in1, in6, in4, in7, in5, \ 2230 out0, out1, out2, out3); \ 2231 MSA_ILVR_H2(RTYPE, out2, out0, out3, out1, out2, out3); \ 2232 out0 = (RTYPE) __msa_ilvr_b((v16i8) out3, (v16i8) out2); \ 2233 out1 = (RTYPE) __msa_sldi_b(zero_m, (v16i8) out0, 8); \ 2234 out2 = (RTYPE) __msa_ilvl_b((v16i8) out3, (v16i8) out2); \ 2235 out3 = (RTYPE) __msa_sldi_b(zero_m, (v16i8) out2, 8); \ 2236 } 2237 2238 /* Description : Transposes 16x4 block into 4x16 with byte elements in vectors. 2239 * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, 2240 * in8, in9, in10, in11, in12, in13, in14, in15 2241 * Outputs - out0, out1, out2, out3 2242 * Return Type - RTYPE 2243 * Details : 2244 */ 2245 #define MSA_TRANSPOSE16x4_B(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 2246 in8, in9, in10, in11, in12, in13, in14, in15, \ 2247 out0, out1, out2, out3) \ 2248 { \ 2249 v2i64 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ 2250 \ 2251 MSA_ILVR_B4(RTYPE, in2, in0, in3, in1, in6, in4, in7, in5, \ 2252 out0, out1, out2, out3); \ 2253 MSA_ILVR_H2(RTYPE, out2, out0, out3, out1, out2, out3); \ 2254 MSA_ILVRL_B2(v2i64, out3, out2, tmp0_m, tmp1_m); \ 2255 \ 2256 MSA_ILVR_B4(RTYPE, in10, in8, in11, in9, in14, in12, in15, in13, \ 2257 out0, out1, out2, out3); \ 2258 MSA_ILVR_H2(RTYPE, out2, out0, out3, out1, out2, out3); \ 2259 MSA_ILVRL_B2(v2i64, out3, out2, tmp2_m, tmp3_m); \ 2260 \ 2261 MSA_ILVRL_D4(RTYPE, tmp2_m, tmp0_m, tmp3_m, tmp1_m, \ 2262 out0, out1, out2, out3); \ 2263 } 2264 2265 /* Description : Transposes input 8x8 byte block. 
2266 * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 2267 * (input 8x8 byte block) 2268 * Outputs - out0, out1, out2, out3, out4, out5, out6, out7 2269 * (output 8x8 byte block) 2270 * Return Type - RTYPE 2271 * Details : 2272 */ 2273 #define MSA_TRANSPOSE8x8_B(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 2274 out0, out1, out2, out3, out4, out5, out6, out7) \ 2275 { \ 2276 v16i8 zero_m = {0}; \ 2277 \ 2278 MSA_ILVR_B4(RTYPE, in2, in0, in3, in1, in6, in4, in7, in5, \ 2279 out0, out1, out2, out3); \ 2280 MSA_ILVRL_B4(RTYPE, out1, out0, out3, out2, out4, out5, out6, out7); \ 2281 MSA_ILVRL_W4(RTYPE, out6, out4, out7, out5, out0, out2, out4, out6); \ 2282 out1 = (RTYPE) __msa_sldi_b(zero_m, (v16i8) out0, 8); \ 2283 out3 = (RTYPE) __msa_sldi_b(zero_m, (v16i8) out2, 8); \ 2284 out5 = (RTYPE) __msa_sldi_b(zero_m, (v16i8) out4, 8); \ 2285 out7 = (RTYPE) __msa_sldi_b(zero_m, (v16i8) out6, 8); \ 2286 } 2287 2288 /* Description : Transposes 16x8 block into 8x16 with byte elements in vectors. 
2289 * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, 2290 * in8, in9, in10, in11, in12, in13, in14, in15 2291 * Outputs - out0, out1, out2, out3, out4, out5, out6, out7 2292 * Return Type - RTYPE 2293 * Details : 2294 */ 2295 #define MSA_TRANSPOSE16x8_B(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 2296 in8, in9, in10, in11, in12, in13, in14, in15, \ 2297 out0, out1, out2, out3, out4, out5, out6, out7) \ 2298 { \ 2299 v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ 2300 \ 2301 MSA_ILVEV_D4(RTYPE, in8, in0, in9, in1, in10, in2, in11, in3, \ 2302 out7, out6, out5, out4); \ 2303 MSA_ILVEV_D4(RTYPE, in12, in4, in13, in5, in14, in6, in15, in7, \ 2304 out3, out2, out1, out0); \ 2305 \ 2306 tmp0_m = __msa_ilvev_b((v16i8) out6, (v16i8) out7); \ 2307 tmp1_m = __msa_ilvod_b((v16i8) out6, (v16i8) out7); \ 2308 out6 = (RTYPE) __msa_ilvev_b((v16i8) out4, (v16i8) out5); \ 2309 out5 = (RTYPE) __msa_ilvod_b((v16i8) out4, (v16i8) out5); \ 2310 tmp2_m = __msa_ilvev_b((v16i8) out2, (v16i8) out3); \ 2311 tmp3_m = __msa_ilvod_b((v16i8) out2, (v16i8) out3); \ 2312 out2 = (RTYPE) __msa_ilvev_b((v16i8) out0, (v16i8) out1); \ 2313 out1 = (RTYPE) __msa_ilvod_b((v16i8) out0, (v16i8) out1); \ 2314 \ 2315 MSA_ILVEV_H2(RTYPE, out6, tmp0_m, out2, tmp2_m, out3, out7); \ 2316 out0 = (RTYPE) __msa_ilvev_w((v4i32) out7, (v4i32) out3); \ 2317 out4 = (RTYPE) __msa_ilvod_w((v4i32) out7, (v4i32) out3); \ 2318 \ 2319 MSA_ILVOD_H2(RTYPE, out6, tmp0_m, out2, tmp2_m, out3, out7); \ 2320 out2 = (RTYPE) __msa_ilvev_w((v4i32) out7, (v4i32) out3); \ 2321 out6 = (RTYPE) __msa_ilvod_w((v4i32) out7, (v4i32) out3); \ 2322 \ 2323 MSA_ILVOD_H2(v16i8, out5, tmp1_m, out1, tmp3_m, tmp0_m, tmp2_m); \ 2324 out3 = (RTYPE) __msa_ilvev_w((v4i32) tmp2_m, (v4i32) tmp0_m); \ 2325 out7 = (RTYPE) __msa_ilvod_w((v4i32) tmp2_m, (v4i32) tmp0_m); \ 2326 \ 2327 MSA_ILVEV_H2(v16i8, out5, tmp1_m, out1, tmp3_m, tmp0_m, tmp2_m); \ 2328 out1 = (RTYPE) __msa_ilvev_w((v4i32) tmp2_m, (v4i32) tmp0_m); \ 2329 out5 = (RTYPE) 
__msa_ilvod_w((v4i32) tmp2_m, (v4i32) tmp0_m); \ 2330 } 2331 2332 /* Description : Transposes 4x4 block with half word elements in vectors. 2333 * Arguments : Inputs - in0, in1, in2, in3 2334 * Outputs - out0, out1, out2, out3 2335 * Return Type - RTYPE 2336 * Details : 2337 */ 2338 #define MSA_TRANSPOSE4x4_H(RTYPE, in0, in1, in2, in3, \ 2339 out0, out1, out2, out3) \ 2340 { \ 2341 MSA_ILVR_H2(RTYPE, in1, in0, in3, in2, out1, out3); \ 2342 MSA_ILVRL_W2(RTYPE, out3, out1, out0, out2); \ 2343 MSA_ILVL_D2(RTYPE, out0, out0, out2, out2, out1, out3); \ 2344 } 2345 2346 /* Description : Transposes 8x4 block with half word elements in vectors. 2347 * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 2348 * Outputs - out0, out1, out2, out3 2349 * Return Type - RTYPE 2350 * Details : 2351 */ 2352 #define MSA_TRANSPOSE8x4_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 2353 out0, out1, out2, out3) \ 2354 { \ 2355 v8i16 s0_m, s1_m; \ 2356 v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ 2357 \ 2358 MSA_ILVR_H2(v8i16, in6, in4, in7, in5, s0_m, s1_m); \ 2359 MSA_ILVRL_H2(v8i16, s1_m, s0_m, tmp0_m, tmp1_m); \ 2360 MSA_ILVR_H2(v8i16, in2, in0, in3, in1, s0_m, s1_m); \ 2361 MSA_ILVRL_H2(v8i16, s1_m, s0_m, tmp2_m, tmp3_m); \ 2362 MSA_PCKEV_D2(RTYPE, tmp0_m, tmp2_m, tmp1_m, tmp3_m, out0, out2); \ 2363 MSA_PCKOD_D2(RTYPE, tmp0_m, tmp2_m, tmp1_m, tmp3_m, out1, out3); \ 2364 } 2365 2366 /* Description : Transposes 8x8 block with half word elements in vectors. 
2367 * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 2368 * Outputs - out0, out1, out2, out3, out4, out5, out6, out7 2369 * Return Type - RTYPE 2370 * Details : 2371 */ 2372 #define MSA_TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 2373 out0, out1, out2, out3, out4, out5, out6, out7) \ 2374 { \ 2375 v8i16 s0_m, s1_m; \ 2376 v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ 2377 v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ 2378 \ 2379 MSA_ILVR_H2(v8i16, in6, in4, in7, in5, s0_m, s1_m); \ 2380 MSA_ILVRL_H2(v8i16, s1_m, s0_m, tmp0_m, tmp1_m); \ 2381 MSA_ILVL_H2(v8i16, in6, in4, in7, in5, s0_m, s1_m); \ 2382 MSA_ILVRL_H2(v8i16, s1_m, s0_m, tmp2_m, tmp3_m); \ 2383 MSA_ILVR_H2(v8i16, in2, in0, in3, in1, s0_m, s1_m); \ 2384 MSA_ILVRL_H2(v8i16, s1_m, s0_m, tmp4_m, tmp5_m); \ 2385 MSA_ILVL_H2(v8i16, in2, in0, in3, in1, s0_m, s1_m); \ 2386 MSA_ILVRL_H2(v8i16, s1_m, s0_m, tmp6_m, tmp7_m); \ 2387 MSA_PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, \ 2388 tmp3_m, tmp7_m, out0, out2, out4, out6); \ 2389 MSA_PCKOD_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, \ 2390 tmp3_m, tmp7_m, out1, out3, out5, out7); \ 2391 } 2392 2393 #endif /* _MSA_MACROS_H */ 2394