1 /* 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #ifndef VPX_DSP_MIPS_MACROS_MSA_H_ 12 #define VPX_DSP_MIPS_MACROS_MSA_H_ 13 14 #include <msa.h> 15 16 #include "./vpx_config.h" 17 #include "vpx/vpx_integer.h" 18 19 #define LD_B(RTYPE, psrc) *((const RTYPE *)(psrc)) 20 #define LD_UB(...) LD_B(v16u8, __VA_ARGS__) 21 #define LD_SB(...) LD_B(v16i8, __VA_ARGS__) 22 23 #define LD_H(RTYPE, psrc) *((const RTYPE *)(psrc)) 24 #define LD_UH(...) LD_H(v8u16, __VA_ARGS__) 25 #define LD_SH(...) LD_H(v8i16, __VA_ARGS__) 26 27 #define LD_W(RTYPE, psrc) *((const RTYPE *)(psrc)) 28 #define LD_SW(...) LD_W(v4i32, __VA_ARGS__) 29 30 #define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) 31 #define ST_UB(...) ST_B(v16u8, __VA_ARGS__) 32 #define ST_SB(...) ST_B(v16i8, __VA_ARGS__) 33 34 #define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) 35 #define ST_SH(...) ST_H(v8i16, __VA_ARGS__) 36 37 #define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) 38 #define ST_SW(...) ST_W(v4i32, __VA_ARGS__) 39 40 #if (__mips_isa_rev >= 6) 41 #define LH(psrc) \ 42 ({ \ 43 const uint8_t *psrc_m = (const uint8_t *)(psrc); \ 44 uint16_t val_m; \ 45 \ 46 __asm__ __volatile__("lh %[val_m], %[psrc_m] \n\t" \ 47 \ 48 : [val_m] "=r"(val_m) \ 49 : [psrc_m] "m"(*psrc_m)); \ 50 \ 51 val_m; \ 52 }) 53 54 #define LW(psrc) \ 55 ({ \ 56 const uint8_t *psrc_m = (const uint8_t *)(psrc); \ 57 uint32_t val_m; \ 58 \ 59 __asm__ __volatile__("lw %[val_m], %[psrc_m] \n\t" \ 60 \ 61 : [val_m] "=r"(val_m) \ 62 : [psrc_m] "m"(*psrc_m)); \ 63 \ 64 val_m; \ 65 }) 66 67 #if (__mips == 64) 68 #define LD(psrc) \ 69 ({ \ 70 const uint8_t *psrc_m = (const uint8_t *)(psrc); \ 71 uint64_t val_m = 0; \ 72 \ 73 __asm__ __volatile__("ld %[val_m], %[psrc_m] \n\t" \ 74 \ 75 : [val_m] "=r"(val_m) \ 76 : [psrc_m] "m"(*psrc_m)); \ 77 \ 78 val_m; \ 79 }) 80 #else // !(__mips == 64) 81 #define LD(psrc) \ 82 ({ \ 83 const uint8_t *psrc_m = (const uint8_t *)(psrc); \ 84 uint32_t val0_m, val1_m; \ 85 uint64_t val_m = 0; \ 86 \ 87 val0_m = LW(psrc_m); \ 88 val1_m = LW(psrc_m + 4); \ 89 \ 90 val_m = (uint64_t)(val1_m); \ 91 val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ 92 val_m = (uint64_t)(val_m | (uint64_t)val0_m); \ 93 \ 94 val_m; \ 95 }) 96 #endif // (__mips == 64) 97 98 #define SH(val, pdst) \ 99 { \ 100 uint8_t *pdst_m = (uint8_t *)(pdst); \ 101 const uint16_t val_m = (val); \ 102 \ 103 __asm__ __volatile__("sh %[val_m], %[pdst_m] \n\t" \ 104 \ 105 : [pdst_m] "=m"(*pdst_m) \ 106 : [val_m] "r"(val_m)); \ 107 } 108 109 #define SW(val, pdst) \ 110 { \ 111 uint8_t *pdst_m = (uint8_t *)(pdst); \ 112 const uint32_t val_m = (val); \ 113 \ 114 __asm__ __volatile__("sw %[val_m], %[pdst_m] \n\t" \ 115 \ 116 : [pdst_m] "=m"(*pdst_m) \ 117 : [val_m] "r"(val_m)); \ 118 } 119 120 #define SD(val, pdst) \ 121 { \ 122 uint8_t *pdst_m = (uint8_t *)(pdst); \ 123 const uint64_t val_m = (val); \ 124 \ 125 __asm__ __volatile__("sd %[val_m], %[pdst_m] \n\t" \ 126 \ 127 : [pdst_m] "=m"(*pdst_m) \ 128 : [val_m] "r"(val_m)); \ 129 } 130 #else // !(__mips_isa_rev >= 6) 131 #define LH(psrc) \ 132 ({ \ 133 const uint8_t *psrc_m = (const uint8_t *)(psrc); \ 134 uint16_t val_m; \ 135 \ 136 __asm__ __volatile__("ulh %[val_m], %[psrc_m] \n\t" \ 137 \ 138 : [val_m] "=r"(val_m) \ 139 : [psrc_m] "m"(*psrc_m)); \ 140 \ 141 val_m; \ 142 }) 143 144 #define LW(psrc) \ 145 ({ \ 146 const uint8_t *psrc_m = (const uint8_t *)(psrc); \ 147 uint32_t val_m; \ 148 \ 149 __asm__ __volatile__("ulw %[val_m], %[psrc_m] \n\t" \ 150 \ 151 : [val_m] "=r"(val_m) \ 152 : [psrc_m] "m"(*psrc_m)); \ 153 \ 154 val_m; \ 155 }) 156 157 #if (__mips == 64) 158 #define LD(psrc) \ 159 ({ \ 160 const uint8_t *psrc_m = (const uint8_t *)(psrc); \ 161 uint64_t val_m = 0; \ 162 \ 163 __asm__ __volatile__("uld %[val_m], %[psrc_m] \n\t" \ 164 \ 165 : [val_m] "=r"(val_m) \ 166 : [psrc_m] "m"(*psrc_m)); \ 167 \ 168 val_m; \ 169 }) 170 #else // !(__mips == 64) 171 #define LD(psrc) \ 172 ({ \ 173 const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \ 174 uint32_t val0_m, val1_m; \ 175 uint64_t val_m_combined = 0; \ 176 \ 177 val0_m = LW(psrc_m1); \ 178 val1_m = LW(psrc_m1 + 4); \ 179 \ 180 val_m_combined = (uint64_t)(val1_m); \ 181 val_m_combined = (uint64_t)((val_m_combined << 32) & 0xFFFFFFFF00000000); \ 182 val_m_combined = (uint64_t)(val_m_combined | (uint64_t)val0_m); \ 183 \ 184 val_m_combined; \ 185 }) 186 #endif // (__mips == 64) 187 188 #define SH(val, pdst) \ 189 { \ 190 uint8_t *pdst_m = (uint8_t *)(pdst); \ 191 const uint16_t val_m = (val); \ 192 \ 193 __asm__ __volatile__("ush %[val_m], %[pdst_m] \n\t" \ 194 \ 195 : [pdst_m] "=m"(*pdst_m) \ 196 : [val_m] "r"(val_m)); \ 197 } 198 199 #define SW(val, pdst) \ 200 { \ 201 uint8_t *pdst_m = (uint8_t *)(pdst); \ 202 const uint32_t val_m = (val); \ 203 \ 204 __asm__ __volatile__("usw %[val_m], %[pdst_m] \n\t" \ 205 \ 206 : [pdst_m] "=m"(*pdst_m) \ 207 : [val_m] "r"(val_m)); \ 208 } 209 210 #define SD(val, pdst) \ 211 { \ 212 uint8_t *pdst_m1 = (uint8_t *)(pdst); \ 213 uint32_t val0_m, val1_m; \ 214 \ 215 val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \ 216 val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ 217 \ 218 SW(val0_m, pdst_m1); \ 219 SW(val1_m, pdst_m1 + 4); \ 220 } 221 #endif // (__mips_isa_rev >= 6) 222 223 /* Description : Load 4 words with stride 224 Arguments : Inputs - psrc, stride 225 Outputs - out0, out1, out2, out3 226 Details : Load word in 'out0' from (psrc) 227 Load word in 'out1' from (psrc + stride) 228 Load word in 'out2' from (psrc + 2 * stride) 229 Load word in 'out3' from (psrc + 3 * stride) 230 */ 231 #define LW4(psrc, stride, out0, out1, out2, out3) \ 232 { \ 233 out0 = LW((psrc)); \ 234 out1 = LW((psrc) + stride); \ 235 out2 = LW((psrc) + 2 * stride); \ 236 out3 = LW((psrc) + 3 * stride); \ 237 } 238 239 /* Description : Load double words with stride 240 Arguments : Inputs - psrc, stride 241 Outputs - out0, out1 242 Details : Load double word in 'out0' from (psrc) 243 Load double word in 'out1' from (psrc + stride) 244 */ 245 #define LD2(psrc, stride, out0, out1) \ 246 { \ 247 out0 = LD((psrc)); \ 248 out1 = LD((psrc) + stride); \ 249 } 250 #define LD4(psrc, stride, out0, out1, out2, out3) \ 251 { \ 252 LD2((psrc), stride, out0, out1); \ 253 LD2((psrc) + 2 * stride, stride, out2, out3); \ 254 } 255 256 /* Description : Store 4 words with stride 257 Arguments : Inputs - in0, in1, in2, in3, pdst, stride 258 Details : Store word from 'in0' to (pdst) 259 Store word from 'in1' to (pdst + stride) 260 Store word from 'in2' to (pdst + 2 * stride) 261 Store word from 'in3' to (pdst + 3 * stride) 262 */ 263 #define SW4(in0, in1, in2, in3, pdst, stride) \ 264 { \ 265 SW(in0, (pdst)) \ 266 SW(in1, (pdst) + stride); \ 267 SW(in2, (pdst) + 2 * stride); \ 268 SW(in3, (pdst) + 3 * stride); \ 269 } 270 271 /* Description : Store 4 double words with stride 272 Arguments : Inputs - in0, in1, in2, in3, pdst, stride 273 Details : Store double word from 'in0' to (pdst) 274 Store double word from 'in1' to (pdst + stride) 275 Store double word from 'in2' to (pdst + 2 * stride) 276 Store double word from 'in3' to (pdst + 3 * stride) 277 */ 278 #define SD4(in0, in1, in2, in3, pdst, stride) \ 279 { \ 280 SD(in0, (pdst)) \ 281 SD(in1, (pdst) + stride); \ 282 SD(in2, (pdst) + 2 * stride); \ 283 SD(in3, (pdst) + 3 * stride); \ 284 } 285 286 /* Description : Load vectors with 16 byte elements with stride 287 Arguments : Inputs - psrc, stride 288 Outputs - out0, out1 289 Return Type - as per RTYPE 290 Details : Load 16 byte elements in 'out0' from (psrc) 291 Load 16 byte elements in 'out1' from (psrc + stride) 292 */ 293 #define LD_B2(RTYPE, psrc, stride, out0, out1) \ 294 { \ 295 out0 = LD_B(RTYPE, (psrc)); \ 296 out1 = LD_B(RTYPE, (psrc) + stride); \ 297 } 298 #define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__) 299 #define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__) 300 301 #define LD_B3(RTYPE, psrc, stride, out0, out1, out2) \ 302 { \ 303 LD_B2(RTYPE, (psrc), stride, out0, out1); \ 304 out2 = LD_B(RTYPE, (psrc) + 2 * stride); \ 305 } 306 #define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__) 307 308 #define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \ 309 { \ 310 LD_B2(RTYPE, (psrc), stride, out0, out1); \ 311 LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ 312 } 313 #define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__) 314 #define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__) 315 316 #define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \ 317 { \ 318 LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ 319 out4 = LD_B(RTYPE, (psrc) + 4 * stride); \ 320 } 321 #define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__) 322 #define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__) 323 324 #define LD_B7(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6) \ 325 { \ 326 LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \ 327 LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \ 328 } 329 #define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__) 330 331 #define LD_B8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ 332 out7) \ 333 { \ 334 LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ 335 LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ 336 } 337 #define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__) 338 #define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__) 339 340 /* Description : Load vectors with 8 halfword elements with stride 341 Arguments : Inputs - psrc, stride 342 Outputs - out0, out1 343 Details : Load 8 halfword elements in 'out0' from (psrc) 344 Load 8 halfword elements in 'out1' from (psrc + stride) 345 */ 346 #define LD_H2(RTYPE, psrc, stride, out0, out1) \ 347 { \ 348 out0 = LD_H(RTYPE, (psrc)); \ 349 out1 = LD_H(RTYPE, (psrc) + (stride)); \ 350 } 351 #define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__) 352 353 #define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) \ 354 { \ 355 LD_H2(RTYPE, (psrc), stride, out0, out1); \ 356 LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ 357 } 358 #define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__) 359 360 #define LD_H8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ 361 out7) \ 362 { \ 363 LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ 364 LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ 365 } 366 #define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__) 367 368 #define LD_H16(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ 369 out7, out8, out9, out10, out11, out12, out13, out14, out15) \ 370 { \ 371 LD_H8(RTYPE, (psrc), stride, out0, out1, out2, out3, out4, out5, out6, \ 372 out7); \ 373 LD_H8(RTYPE, (psrc) + 8 * stride, stride, out8, out9, out10, out11, out12, \ 374 out13, out14, out15); \ 375 } 376 #define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__) 377 378 /* Description : Load 4x4 block of signed halfword elements from 1D source 379 data into 4 vectors (Each vector with 4 signed halfwords) 380 Arguments : Input - psrc 381 Outputs - out0, out1, out2, out3 382 */ 383 #define LD4x4_SH(psrc, out0, out1, out2, out3) \ 384 { \ 385 out0 = LD_SH(psrc); \ 386 out2 = LD_SH(psrc + 8); \ 387 out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ 388 out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \ 389 } 390 391 /* Description : Load 2 vectors of signed word elements with stride 392 Arguments : Inputs - psrc, stride 393 Outputs - out0, out1 394 Return Type - signed word 395 */ 396 #define LD_SW2(psrc, stride, out0, out1) \ 397 { \ 398 out0 = LD_SW((psrc)); \ 399 out1 = LD_SW((psrc) + stride); \ 400 } 401 402 /* Description : Store vectors of 16 byte elements with stride 403 Arguments : Inputs - in0, in1, pdst, stride 404 Details : Store 16 byte elements from 'in0' to (pdst) 405 Store 16 byte elements from 'in1' to (pdst + stride) 406 */ 407 #define ST_B2(RTYPE, in0, in1, pdst, stride) \ 408 { \ 409 ST_B(RTYPE, in0, (pdst)); \ 410 ST_B(RTYPE, in1, (pdst) + stride); \ 411 } 412 #define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__) 413 414 #define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \ 415 { \ 416 ST_B2(RTYPE, in0, in1, (pdst), stride); \ 417 ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ 418 } 419 #define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__) 420 421 #define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ 422 { \ 423 ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride); \ 424 ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ 425 } 426 #define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__) 427 428 /* Description : Store vectors of 8 halfword elements with stride 429 Arguments : Inputs - in0, in1, pdst, stride 430 Details : Store 8 halfword elements from 'in0' to (pdst) 431 Store 8 halfword elements from 'in1' to (pdst + stride) 432 */ 433 #define ST_H2(RTYPE, in0, in1, pdst, stride) \ 434 { \ 435 ST_H(RTYPE, in0, (pdst)); \ 436 ST_H(RTYPE, in1, (pdst) + stride); \ 437 } 438 #define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__) 439 440 #define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride) \ 441 { \ 442 ST_H2(RTYPE, in0, in1, (pdst), stride); \ 443 ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ 444 } 445 #define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__) 446 447 #define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ 448 { \ 449 ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride); \ 450 ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ 451 } 452 #define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__) 453 454 /* Description : Store vectors of word elements with stride 455 Arguments : Inputs - in0, in1, pdst, stride 456 Details : Store 4 word elements from 'in0' to (pdst) 457 Store 4 word elements from 'in1' to (pdst + stride) 458 */ 459 #define ST_SW2(in0, in1, pdst, stride) \ 460 { \ 461 ST_SW(in0, (pdst)); \ 462 ST_SW(in1, (pdst) + stride); \ 463 } 464 465 /* Description : Store 2x4 byte block to destination memory from input vector 466 Arguments : Inputs - in, stidx, pdst, stride 467 Details : Index 'stidx' halfword element from 'in' vector is copied to 468 the GP register and stored to (pdst) 469 Index 'stidx+1' halfword element from 'in' vector is copied to 470 the GP register and stored to (pdst + stride) 471 Index 'stidx+2' halfword element from 'in' vector is copied to 472 the GP register and stored to (pdst + 2 * stride) 473 Index 'stidx+3' halfword element from 'in' vector is copied to 474 the GP register and stored to (pdst + 3 * stride) 475 */ 476 #define ST2x4_UB(in, stidx, pdst, stride) \ 477 { \ 478 uint16_t out0_m, out1_m, out2_m, out3_m; \ 479 uint8_t *pblk_2x4_m = (uint8_t *)(pdst); \ 480 \ 481 out0_m = __msa_copy_u_h((v8i16)in, (stidx)); \ 482 out1_m = __msa_copy_u_h((v8i16)in, (stidx + 1)); \ 483 out2_m = __msa_copy_u_h((v8i16)in, (stidx + 2)); \ 484 out3_m = __msa_copy_u_h((v8i16)in, (stidx + 3)); \ 485 \ 486 SH(out0_m, pblk_2x4_m); \ 487 SH(out1_m, pblk_2x4_m + stride); \ 488 SH(out2_m, pblk_2x4_m + 2 * stride); \ 489 SH(out3_m, pblk_2x4_m + 3 * stride); \ 490 } 491 492 /* Description : Store 4x2 byte block to destination memory from input vector 493 Arguments : Inputs - in, pdst, stride 494 Details : Index 0 word element from 'in' vector is copied to the GP 495 register and stored to (pdst) 496 Index 1 word element from 'in' vector is copied to the GP 497 register and stored to (pdst + stride) 498 */ 499 #define ST4x2_UB(in, pdst, stride) \ 500 { \ 501 uint32_t out0_m, out1_m; \ 502 uint8_t *pblk_4x2_m = (uint8_t *)(pdst); \ 503 \ 504 out0_m = __msa_copy_u_w((v4i32)in, 0); \ 505 out1_m = __msa_copy_u_w((v4i32)in, 1); \ 506 \ 507 SW(out0_m, pblk_4x2_m); \ 508 SW(out1_m, pblk_4x2_m + stride); \ 509 } 510 511 /* Description : Store 4x4 byte block to destination memory from input vector 512 Arguments : Inputs - in0, in1, pdst, stride 513 Details : 'Idx0' word element from input vector 'in0' is copied to the 514 GP register and stored to (pdst) 515 'Idx1' word element from input vector 'in0' is copied to the 516 GP register and stored to (pdst + stride) 517 'Idx2' word element from input vector 'in0' is copied to the 518 GP register and stored to (pdst + 2 * stride) 519 'Idx3' word element from input vector 'in0' is copied to the 520 GP register and stored to (pdst + 3 * stride) 521 */ 522 #define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \ 523 { \ 524 uint32_t out0_m, out1_m, out2_m, out3_m; \ 525 uint8_t *pblk_4x4_m = (uint8_t *)(pdst); \ 526 \ 527 out0_m = __msa_copy_u_w((v4i32)in0, idx0); \ 528 out1_m = __msa_copy_u_w((v4i32)in0, idx1); \ 529 out2_m = __msa_copy_u_w((v4i32)in1, idx2); \ 530 out3_m = __msa_copy_u_w((v4i32)in1, idx3); \ 531 \ 532 SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride); \ 533 } 534 #define ST4x8_UB(in0, in1, pdst, stride) \ 535 { \ 536 uint8_t *pblk_4x8 = (uint8_t *)(pdst); \ 537 \ 538 ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride); \ 539 ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride); \ 540 } 541 542 /* Description : Store 8x1 byte block to destination memory from input vector 543 Arguments : Inputs - in, pdst 544 Details : Index 0 double word element from 'in' vector is copied to the 545 GP register and stored to (pdst) 546 */ 547 #define ST8x1_UB(in, pdst) \ 548 { \ 549 uint64_t out0_m; \ 550 \ 551 out0_m = __msa_copy_u_d((v2i64)in, 0); \ 552 SD(out0_m, pdst); \ 553 } 554 555 /* Description : Store 8x2 byte block to destination memory from input vector 556 Arguments : Inputs - in, pdst, stride 557 Details : Index 0 double word element from 'in' vector is copied to the 558 GP register and stored to (pdst) 559 Index 1 double word element from 'in' vector is copied to the 560 GP register and stored to (pdst + stride) 561 */ 562 #define ST8x2_UB(in, pdst, stride) \ 563 { \ 564 uint64_t out0_m, out1_m; \ 565 uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \ 566 \ 567 out0_m = __msa_copy_u_d((v2i64)in, 0); \ 568 out1_m = __msa_copy_u_d((v2i64)in, 1); \ 569 \ 570 SD(out0_m, pblk_8x2_m); \ 571 SD(out1_m, pblk_8x2_m + stride); \ 572 } 573 574 /* Description : Store 8x4 byte block to destination memory from input 575 vectors 576 Arguments : Inputs - in0, in1, pdst, stride 577 Details : Index 0 double word element from 'in0' vector is copied to the 578 GP register and stored to (pdst) 579 Index 1 double word element from 'in0' vector is copied to the 580 GP register and stored to (pdst + stride) 581 Index 0 double word element from 'in1' vector is copied to the 582 GP register and stored to (pdst + 2 * stride) 583 Index 1 double word element from 'in1' vector is copied to the 584 GP register and stored to (pdst + 3 * stride) 585 */ 586 #define ST8x4_UB(in0, in1, pdst, stride) \ 587 { \ 588 uint64_t out0_m, out1_m, out2_m, out3_m; \ 589 uint8_t *pblk_8x4_m = (uint8_t *)(pdst); \ 590 \ 591 out0_m = __msa_copy_u_d((v2i64)in0, 0); \ 592 out1_m = __msa_copy_u_d((v2i64)in0, 1); \ 593 out2_m = __msa_copy_u_d((v2i64)in1, 0); \ 594 out3_m = __msa_copy_u_d((v2i64)in1, 1); \ 595 \ 596 SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \ 597 } 598 599 /* Description : average with rounding (in0 + in1 + 1) / 2. 600 Arguments : Inputs - in0, in1, in2, in3, 601 Outputs - out0, out1 602 Return Type - as per RTYPE 603 Details : Each unsigned byte element from 'in0' vector is added with 604 each unsigned byte element from 'in1' vector. Then the average 605 with rounding is calculated and written to 'out0' 606 */ 607 #define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \ 608 { \ 609 out0 = (RTYPE)__msa_aver_u_b((v16u8)in0, (v16u8)in1); \ 610 out1 = (RTYPE)__msa_aver_u_b((v16u8)in2, (v16u8)in3); \ 611 } 612 #define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__) 613 614 #define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ 615 out2, out3) \ 616 { \ 617 AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \ 618 AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3) \ 619 } 620 #define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__) 621 622 /* Description : Immediate number of elements to slide with zero 623 Arguments : Inputs - in0, in1, slide_val 624 Outputs - out0, out1 625 Return Type - as per RTYPE 626 Details : Byte elements from 'zero_m' vector are slid into 'in0' by 627 value specified in the 'slide_val' 628 */ 629 #define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) \ 630 { \ 631 v16i8 zero_m = { 0 }; \ 632 out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in0, slide_val); \ 633 out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in1, slide_val); \ 634 } 635 #define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__) 636 637 #define SLDI_B4_0(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, \ 638 slide_val) \ 639 { \ 640 SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val); \ 641 SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val); \ 642 } 643 #define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__) 644 645 /* Description : Immediate number of elements to slide 646 Arguments : Inputs - in0_0, in0_1, in1_0, in1_1, slide_val 647 Outputs - out0, out1 648 Return Type - as per RTYPE 649 Details : Byte elements from 'in0_0' vector are slid into 'in1_0' by 650 value specified in the 'slide_val' 651 */ 652 #define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \ 653 { \ 654 out0 = (RTYPE)__msa_sldi_b((v16i8)in0_0, (v16i8)in1_0, slide_val); \ 655 out1 = (RTYPE)__msa_sldi_b((v16i8)in0_1, (v16i8)in1_1, slide_val); \ 656 } 657 #define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__) 658 #define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__) 659 660 #define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, out0, out1, \ 661 out2, slide_val) \ 662 { \ 663 SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \ 664 out2 = (RTYPE)__msa_sldi_b((v16i8)in0_2, (v16i8)in1_2, slide_val); \ 665 } 666 #define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__) 667 #define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__) 668 669 /* Description : Shuffle byte vector elements as per mask vector 670 Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 671 Outputs - out0, out1 672 Return Type - as per RTYPE 673 Details : Byte elements from 'in0' & 'in1' are copied selectively to 674 'out0' as per control vector 'mask0' 675 */ 676 #define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \ 677 { \ 678 out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \ 679 out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \ 680 } 681 #define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__) 682 #define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__) 683 #define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__) 684 685 #define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, out0, out1, out2, \ 686 out3) \ 687 { \ 688 VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1); \ 689 VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3); \ 690 } 691 #define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__) 692 #define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__) 693 694 /* Description : Dot product of byte vector elements 695 Arguments : Inputs - mult0, mult1, cnst0, cnst1 696 Outputs - out0, out1 697 Return Type - as per RTYPE 698 Details : Unsigned byte elements from 'mult0' are multiplied with 699 unsigned byte elements from 'cnst0' producing a result 700 twice the size of input i.e. unsigned halfword. 701 The multiplication result of adjacent odd-even elements 702 are added together and written to the 'out0' vector 703 */ 704 #define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ 705 { \ 706 out0 = (RTYPE)__msa_dotp_u_h((v16u8)mult0, (v16u8)cnst0); \ 707 out1 = (RTYPE)__msa_dotp_u_h((v16u8)mult1, (v16u8)cnst1); \ 708 } 709 #define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__) 710 711 #define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ 712 cnst3, out0, out1, out2, out3) \ 713 { \ 714 DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ 715 DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ 716 } 717 #define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__) 718 719 /* Description : Dot product of byte vector elements 720 Arguments : Inputs - mult0, mult1, cnst0, cnst1 721 Outputs - out0, out1 722 Return Type - as per RTYPE 723 Details : Signed byte elements from 'mult0' are multiplied with 724 signed byte elements from 'cnst0' producing a result 725 twice the size of input i.e. signed halfword. 726 The multiplication result of adjacent odd-even elements 727 are added together and written to the 'out0' vector 728 */ 729 #define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ 730 { \ 731 out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0); \ 732 out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1); \ 733 } 734 #define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__) 735 736 #define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ 737 cnst3, out0, out1, out2, out3) \ 738 { \ 739 DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ 740 DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ 741 } 742 #define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__) 743 744 /* Description : Dot product of halfword vector elements 745 Arguments : Inputs - mult0, mult1, cnst0, cnst1 746 Outputs - out0, out1 747 Return Type - as per RTYPE 748 Details : Signed halfword elements from 'mult0' are multiplied with 749 signed halfword elements from 'cnst0' producing a result 750 twice the size of input i.e. signed word. 751 The multiplication result of adjacent odd-even elements 752 are added together and written to the 'out0' vector 753 */ 754 #define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ 755 { \ 756 out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0); \ 757 out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1); \ 758 } 759 #define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__) 760 761 #define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ 762 cnst3, out0, out1, out2, out3) \ 763 { \ 764 DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ 765 DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ 766 } 767 #define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__) 768 769 /* Description : Dot product of word vector elements 770 Arguments : Inputs - mult0, mult1, cnst0, cnst1 771 Outputs - out0, out1 772 Return Type - as per RTYPE 773 Details : Signed word elements from 'mult0' are multiplied with 774 signed word elements from 'cnst0' producing a result 775 twice the size of input i.e. signed double word. 776 The multiplication result of adjacent odd-even elements 777 are added together and written to the 'out0' vector 778 */ 779 #define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ 780 { \ 781 out0 = (RTYPE)__msa_dotp_s_d((v4i32)mult0, (v4i32)cnst0); \ 782 out1 = (RTYPE)__msa_dotp_s_d((v4i32)mult1, (v4i32)cnst1); \ 783 } 784 #define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__) 785 786 /* Description : Dot product & addition of byte vector elements 787 Arguments : Inputs - mult0, mult1, cnst0, cnst1 788 Outputs - out0, out1 789 Return Type - as per RTYPE 790 Details : Signed byte elements from 'mult0' are multiplied with 791 signed byte elements from 'cnst0' producing a result 792 twice the size of input i.e. signed halfword. 793 The multiplication result of adjacent odd-even elements 794 are added to the 'out0' vector 795 */ 796 #define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ 797 { \ 798 out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0, (v16i8)mult0, (v16i8)cnst0); \ 799 out1 = (RTYPE)__msa_dpadd_s_h((v8i16)out1, (v16i8)mult1, (v16i8)cnst1); \ 800 } 801 #define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__) 802 803 #define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ 804 cnst3, out0, out1, out2, out3) \ 805 { \ 806 DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ 807 DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ 808 } 809 #define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__) 810 811 /* Description : Dot product & addition of halfword vector elements 812 Arguments : Inputs - mult0, mult1, cnst0, cnst1 813 Outputs - out0, out1 814 Return Type - as per RTYPE 815 Details : Signed halfword elements from 'mult0' are multiplied with 816 signed halfword elements from 'cnst0' producing a result 817 twice the size of input i.e. signed word. 818 The multiplication result of adjacent odd-even elements 819 are added to the 'out0' vector 820 */ 821 #define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ 822 { \ 823 out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)mult0, (v8i16)cnst0); \ 824 out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)mult1, (v8i16)cnst1); \ 825 } 826 #define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__) 827 828 /* Description : Dot product & addition of double word vector elements 829 Arguments : Inputs - mult0, mult1 830 Outputs - out0, out1 831 Return Type - as per RTYPE 832 Details : Each signed word element from 'mult0' is multiplied with itself 833 producing an intermediate result twice the size of input 834 i.e. signed double word 835 The multiplication result of adjacent odd-even elements 836 are added to the 'out0' vector 837 */ 838 #define DPADD_SD2(RTYPE, mult0, mult1, out0, out1) \ 839 { \ 840 out0 = (RTYPE)__msa_dpadd_s_d((v2i64)out0, (v4i32)mult0, (v4i32)mult0); \ 841 out1 = (RTYPE)__msa_dpadd_s_d((v2i64)out1, (v4i32)mult1, (v4i32)mult1); \ 842 } 843 #define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__) 844 845 /* Description : Minimum values between unsigned elements of 846 either vector are copied to the output vector 847 Arguments : Inputs - in0, in1, min_vec 848 Outputs - in place operation 849 Return Type - as per RTYPE 850 Details : Minimum of unsigned halfword element values from 'in0' and 851 'min_vec' are written to output vector 'in0' 852 */ 853 #define MIN_UH2(RTYPE, in0, in1, min_vec) \ 854 { \ 855 in0 = (RTYPE)__msa_min_u_h((v8u16)in0, min_vec); \ 856 in1 = (RTYPE)__msa_min_u_h((v8u16)in1, min_vec); \ 857 } 858 #define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__) 859 860 #define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec) \ 861 { \ 862 MIN_UH2(RTYPE, in0, in1, min_vec); \ 863 MIN_UH2(RTYPE, in2, in3, min_vec); \ 864 } 865 #define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__) 866 867 /* Description : Clips all signed halfword elements of input vector 868 between 0 & 255 869 Arguments : Input - in 870 Output - out_m 871 Return Type - signed halfword 872 */ 873 #define CLIP_SH_0_255(in) \ 874 ({ \ 875 v8i16 max_m = __msa_ldi_h(255); \ 876 v8i16 out_m; \ 877 \ 878 out_m = __msa_maxi_s_h((v8i16)in, 0); \ 879 out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \ 880 out_m; \ 881 }) 882 #define CLIP_SH2_0_255(in0, in1) \ 883 { \ 884 in0 = CLIP_SH_0_255(in0); \ 885 in1 = CLIP_SH_0_255(in1); \ 886 } 887 #define CLIP_SH4_0_255(in0, in1, in2, in3) \ 888 { \ 889 CLIP_SH2_0_255(in0, in1); \ 890 CLIP_SH2_0_255(in2, in3); \ 891 } 892 893 /* Description : Horizontal addition of 4 signed word elements of input vector 894 Arguments : Input - in (signed word vector) 895 Output - sum_m (i32 sum) 896 Return Type - signed word (GP) 897 Details : 4 signed word elements of 'in' vector are added together and 898 the resulting integer sum is returned 899 */ 900 #define HADD_SW_S32(in) \ 901 ({ \ 902 v2i64 res0_m, res1_m; \ 903 int32_t sum_m; \ 904 \ 905 res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in); \ 906 res1_m = __msa_splati_d(res0_m, 1); \ 907 res0_m = res0_m + res1_m; \ 908 sum_m = __msa_copy_s_w((v4i32)res0_m, 0); \ 909 sum_m; \ 910 }) 911 912 /* Description : Horizontal addition of 4 unsigned word elements 913 Arguments : Input - in (unsigned word vector) 914 Output - sum_m (u32 sum) 915 Return Type - unsigned word (GP) 916 Details : 4 unsigned word elements of 'in' vector are added together and 917 the resulting integer sum is returned 918 */ 919 #define HADD_UW_U32(in) \ 920 ({ \ 921 v2u64 res0_m, res1_m; \ 922 uint32_t sum_m; \ 923 \ 924 res0_m = __msa_hadd_u_d((v4u32)in, (v4u32)in); \ 925 res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1); \ 926 res0_m += res1_m; \ 927 sum_m = __msa_copy_u_w((v4i32)res0_m, 0); \ 928 sum_m; \ 929 }) 930 931 /* Description : Horizontal addition of 8 unsigned halfword elements 932 Arguments : Input - in (unsigned halfword vector) 933 Output - sum_m (u32 sum) 934 Return Type - unsigned word 935 Details : 8 unsigned halfword elements of 'in' vector are added 936 together and the resulting integer sum is returned 937 */ 938 #define HADD_UH_U32(in) \ 939 ({ \ 940 v4u32 res_m; \ 941 uint32_t sum_m; \ 942 \ 943 res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \ 944 sum_m = HADD_UW_U32(res_m); \ 945 sum_m; \ 946 }) 947 948 /* Description : Horizontal addition of unsigned byte vector elements 949 Arguments : Inputs - in0, in1 950 Outputs - out0, out1 951 Return Type - as per RTYPE 952 Details : Each unsigned odd byte element from 'in0' is added to 953 even unsigned byte element from 'in0' (pairwise) and the 954 halfword result is written to 'out0' 955 */ 956 #define HADD_UB2(RTYPE, in0, in1, out0, out1) \ 957 { \ 958 out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0); \ 959 out1 = (RTYPE)__msa_hadd_u_h((v16u8)in1, (v16u8)in1); \ 960 } 961 #define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__) 962 963 #define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \ 964 { \ 965 HADD_UB2(RTYPE, in0, in1, out0, out1); \ 966 HADD_UB2(RTYPE, in2, in3, out2, out3); \ 967 } 968 #define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__) 969 970 /* Description : Horizontal subtraction of unsigned byte vector elements 971 Arguments : Inputs - in0, in1 972 Outputs - out0, out1 973 Return Type - as per RTYPE 974 Details : Each unsigned odd byte element from 'in0' is subtracted from 975 even unsigned byte element from 'in0' (pairwise) and the 976 halfword result is written to 'out0' 977 */ 978 #define HSUB_UB2(RTYPE, in0, in1, out0, out1) \ 979 { \ 980 out0 = (RTYPE)__msa_hsub_u_h((v16u8)in0, (v16u8)in0); \ 981 out1 = (RTYPE)__msa_hsub_u_h((v16u8)in1, (v16u8)in1); \ 982 } 983 #define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__) 984 985 /* Description : SAD (Sum of Absolute Difference) 986 Arguments : Inputs - in0, in1, ref0, ref1 987 Outputs - sad_m (halfword vector) 988 Return Type - unsigned halfword 989 Details : Absolute difference of all the byte elements from 'in0' with 990 'ref0' is calculated and preserved in 'diff0'. Then even-odd 991 pairs are added together to generate 8 halfword results. 992 */ 993 #define SAD_UB2_UH(in0, in1, ref0, ref1) \ 994 ({ \ 995 v16u8 diff0_m, diff1_m; \ 996 v8u16 sad_m = { 0 }; \ 997 \ 998 diff0_m = __msa_asub_u_b((v16u8)in0, (v16u8)ref0); \ 999 diff1_m = __msa_asub_u_b((v16u8)in1, (v16u8)ref1); \ 1000 \ 1001 sad_m += __msa_hadd_u_h((v16u8)diff0_m, (v16u8)diff0_m); \ 1002 sad_m += __msa_hadd_u_h((v16u8)diff1_m, (v16u8)diff1_m); \ 1003 \ 1004 sad_m; \ 1005 }) 1006 1007 /* Description : Horizontal subtraction of signed halfword vector elements 1008 Arguments : Inputs - in0, in1 1009 Outputs - out0, out1 1010 Return Type - as per RTYPE 1011 Details : Each signed odd halfword element from 'in0' is subtracted from 1012 even signed halfword element from 'in0' (pairwise) and the 1013 word result is written to 'out0' 1014 */ 1015 #define HSUB_UH2(RTYPE, in0, in1, out0, out1) \ 1016 { \ 1017 out0 = (RTYPE)__msa_hsub_s_w((v8i16)in0, (v8i16)in0); \ 1018 out1 = (RTYPE)__msa_hsub_s_w((v8i16)in1, (v8i16)in1); \ 1019 } 1020 #define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__) 1021 1022 /* Description : Set element n input vector to GPR value 1023 Arguments : Inputs - in0, in1, in2, in3 1024 Output - out 1025 Return Type - as per RTYPE 1026 Details : Set element 0 in vector 'out' to value specified in 'in0' 1027 */ 1028 #define INSERT_W2(RTYPE, in0, in1, out) \ 1029 { \ 1030 out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \ 1031 out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \ 1032 } 1033 #define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__) 1034 1035 #define INSERT_W4(RTYPE, in0, in1, in2, in3, out) \ 1036 { \ 1037 out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \ 1038 out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \ 1039 out = (RTYPE)__msa_insert_w((v4i32)out, 2, in2); \ 1040 out = (RTYPE)__msa_insert_w((v4i32)out, 3, in3); \ 1041 } 1042 #define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__) 1043 #define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__) 1044 1045 #define INSERT_D2(RTYPE, in0, in1, out) \ 1046 { \ 1047 out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0); \ 1048 out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1); \ 1049 } 1050 #define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__) 1051 #define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__) 1052 #define INSERT_D2_SH(...) INSERT_D2(v8i16, __VA_ARGS__) 1053 1054 /* Description : Interleave even byte elements from vectors 1055 Arguments : Inputs - in0, in1, in2, in3 1056 Outputs - out0, out1 1057 Return Type - as per RTYPE 1058 Details : Even byte elements of 'in0' and 'in1' are interleaved 1059 and written to 'out0' 1060 */ 1061 #define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1062 { \ 1063 out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \ 1064 out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \ 1065 } 1066 #define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__) 1067 #define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__) 1068 1069 /* Description : Interleave even halfword elements from vectors 1070 Arguments : Inputs - in0, in1, in2, in3 1071 Outputs - out0, out1 1072 Return Type - as per RTYPE 1073 Details : Even halfword elements of 'in0' and 'in1' are interleaved 1074 and written to 'out0' 1075 */ 1076 #define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1077 { \ 1078 out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \ 1079 out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2); \ 1080 } 1081 #define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__) 1082 #define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__) 1083 #define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__) 1084 1085 /* Description : Interleave even word elements from vectors 1086 Arguments : Inputs - in0, in1, in2, in3 1087 Outputs - out0, out1 1088 Return Type - as per RTYPE 1089 Details : Even word elements of 'in0' and 'in1' are interleaved 1090 and written to 'out0' 1091 */ 1092 #define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1093 { \ 1094 out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0); \ 1095 out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2); \ 1096 } 1097 #define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__) 1098 1099 /* Description : Interleave even double word elements from vectors 1100 Arguments : Inputs - in0, in1, in2, in3 1101 Outputs - out0, out1 1102 Return Type - as per RTYPE 1103 Details : Even double word elements of 'in0' and 'in1' are interleaved 1104 and written to 'out0' 1105 */ 1106 #define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1107 { \ 1108 out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0); \ 1109 out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2); \ 1110 } 1111 #define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__) 1112 1113 /* Description : Interleave left half of byte elements from vectors 1114 Arguments : Inputs - in0, in1, in2, in3 1115 Outputs - out0, out1 1116 Return Type - as per RTYPE 1117 Details : Left half of byte elements of 'in0' and 'in1' are interleaved 1118 and written to 'out0'. 1119 */ 1120 #define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1121 { \ 1122 out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ 1123 out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3); \ 1124 } 1125 #define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__) 1126 #define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__) 1127 #define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__) 1128 #define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__) 1129 1130 #define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ 1131 out2, out3) \ 1132 { \ 1133 ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ 1134 ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ 1135 } 1136 #define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__) 1137 #define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__) 1138 #define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__) 1139 1140 /* Description : Interleave left half of halfword elements from vectors 1141 Arguments : Inputs - in0, in1, in2, in3 1142 Outputs - out0, out1 1143 Return Type - as per RTYPE 1144 Details : Left half of halfword elements of 'in0' and 'in1' are 1145 interleaved and written to 'out0'. 1146 */ 1147 #define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1148 { \ 1149 out0 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \ 1150 out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3); \ 1151 } 1152 #define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__) 1153 #define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__) 1154 1155 /* Description : Interleave left half of word elements from vectors 1156 Arguments : Inputs - in0, in1, in2, in3 1157 Outputs - out0, out1 1158 Return Type - as per RTYPE 1159 Details : Left half of word elements of 'in0' and 'in1' are interleaved 1160 and written to 'out0'. 1161 */ 1162 #define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1163 { \ 1164 out0 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ 1165 out1 = (RTYPE)__msa_ilvl_w((v4i32)in2, (v4i32)in3); \ 1166 } 1167 #define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__) 1168 #define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__) 1169 1170 /* Description : Interleave right half of byte elements from vectors 1171 Arguments : Inputs - in0, in1, in2, in3 1172 Outputs - out0, out1 1173 Return Type - as per RTYPE 1174 Details : Right half of byte elements of 'in0' and 'in1' are interleaved 1175 and written to out0. 1176 */ 1177 #define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1178 { \ 1179 out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ 1180 out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3); \ 1181 } 1182 #define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__) 1183 #define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__) 1184 #define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__) 1185 #define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__) 1186 1187 #define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ 1188 out2, out3) \ 1189 { \ 1190 ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ 1191 ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ 1192 } 1193 #define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__) 1194 #define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__) 1195 #define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__) 1196 #define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__) 1197 1198 #define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, \ 1199 in11, in12, in13, in14, in15, out0, out1, out2, out3, out4, \ 1200 out5, out6, out7) \ 1201 { \ 1202 ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ 1203 out3); \ 1204 ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15, out4, out5, \ 1205 out6, out7); \ 1206 } 1207 #define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__) 1208 1209 /* Description : Interleave right half of halfword elements from vectors 1210 Arguments : Inputs - in0, in1, in2, in3 1211 Outputs - out0, out1 1212 Return Type - as per RTYPE 1213 Details : Right half of halfword elements of 'in0' and 'in1' are 1214 interleaved and written to 'out0'. 1215 */ 1216 #define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1217 { \ 1218 out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ 1219 out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \ 1220 } 1221 #define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__) 1222 #define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__) 1223 1224 #define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ 1225 out2, out3) \ 1226 { \ 1227 ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ 1228 ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ 1229 } 1230 #define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__) 1231 1232 #define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1233 { \ 1234 out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \ 1235 out1 = (RTYPE)__msa_ilvr_w((v4i32)in2, (v4i32)in3); \ 1236 } 1237 #define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__) 1238 #define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__) 1239 1240 #define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ 1241 out2, out3) \ 1242 { \ 1243 ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1); \ 1244 ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3); \ 1245 } 1246 #define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__) 1247 1248 /* Description : Interleave right half of double word elements from vectors 1249 Arguments : Inputs - in0, in1, in2, in3 1250 Outputs - out0, out1 1251 Return Type - as per RTYPE 1252 Details : Right half of double word elements of 'in0' and 'in1' are 1253 interleaved and written to 'out0'. 1254 */ 1255 #define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1256 { \ 1257 out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \ 1258 out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \ 1259 } 1260 #define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__) 1261 #define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__) 1262 #define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__) 1263 1264 #define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \ 1265 { \ 1266 ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ 1267 out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5)); \ 1268 } 1269 #define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__) 1270 1271 #define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ 1272 out2, out3) \ 1273 { \ 1274 ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ 1275 ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3); \ 1276 } 1277 #define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__) 1278 #define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__) 1279 1280 /* Description : Interleave both left and right half of input vectors 1281 Arguments : Inputs - in0, in1 1282 Outputs - out0, out1 1283 Return Type - as per RTYPE 1284 Details : Right half of byte elements from 'in0' and 'in1' are 1285 interleaved and written to 'out0' 1286 */ 1287 #define ILVRL_B2(RTYPE, in0, in1, out0, out1) \ 1288 { \ 1289 out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ 1290 out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ 1291 } 1292 #define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__) 1293 #define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__) 1294 #define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__) 1295 #define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__) 1296 1297 #define ILVRL_H2(RTYPE, in0, in1, out0, out1) \ 1298 { \ 1299 out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ 1300 out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \ 1301 } 1302 #define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__) 1303 #define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__) 1304 1305 #define ILVRL_W2(RTYPE, in0, in1, out0, out1) \ 1306 { \ 1307 out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \ 1308 out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ 1309 } 1310 #define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__) 1311 #define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__) 1312 #define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__) 1313 1314 /* Description : Saturate the halfword element values to the max 1315 unsigned value of (sat_val + 1) bits 1316 The element data width remains unchanged 1317 Arguments : Inputs - in0, in1, sat_val 1318 Outputs - in place operation 1319 Return Type - as per RTYPE 1320 Details : Each unsigned halfword element from 'in0' is saturated to the 1321 value generated with (sat_val + 1) bit range. 1322 The results are written in place 1323 */ 1324 #define SAT_UH2(RTYPE, in0, in1, sat_val) \ 1325 { \ 1326 in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val); \ 1327 in1 = (RTYPE)__msa_sat_u_h((v8u16)in1, sat_val); \ 1328 } 1329 #define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__) 1330 1331 #define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) \ 1332 { \ 1333 SAT_UH2(RTYPE, in0, in1, sat_val); \ 1334 SAT_UH2(RTYPE, in2, in3, sat_val) \ 1335 } 1336 #define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__) 1337 1338 /* Description : Saturate the halfword element values to the max 1339 unsigned value of (sat_val + 1) bits 1340 The element data width remains unchanged 1341 Arguments : Inputs - in0, in1, sat_val 1342 Outputs - in place operation 1343 Return Type - as per RTYPE 1344 Details : Each unsigned halfword element from 'in0' is saturated to the 1345 value generated with (sat_val + 1) bit range 1346 The results are written in place 1347 */ 1348 #define SAT_SH2(RTYPE, in0, in1, sat_val) \ 1349 { \ 1350 in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val); \ 1351 in1 = (RTYPE)__msa_sat_s_h((v8i16)in1, sat_val); \ 1352 } 1353 #define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__) 1354 1355 #define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) \ 1356 { \ 1357 SAT_SH2(RTYPE, in0, in1, sat_val); \ 1358 SAT_SH2(RTYPE, in2, in3, sat_val); \ 1359 } 1360 #define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__) 1361 1362 /* Description : Indexed halfword element values are replicated to all 1363 elements in output vector 1364 Arguments : Inputs - in, idx0, idx1 1365 Outputs - out0, out1 1366 Return Type - as per RTYPE 1367 Details : 'idx0' element value from 'in' vector is replicated to all 1368 elements in 'out0' vector 1369 Valid index range for halfword operation is 0-7 1370 */ 1371 #define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) \ 1372 { \ 1373 out0 = (RTYPE)__msa_splati_h((v8i16)in, idx0); \ 1374 out1 = (RTYPE)__msa_splati_h((v8i16)in, idx1); \ 1375 } 1376 #define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__) 1377 1378 #define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, out0, out1, out2, out3) \ 1379 { \ 1380 SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \ 1381 SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3); \ 1382 } 1383 #define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__) 1384 #define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__) 1385 1386 /* Description : Pack even byte elements of vector pairs 1387 Arguments : Inputs - in0, in1, in2, in3 1388 Outputs - out0, out1 1389 Return Type - as per RTYPE 1390 Details : Even byte elements of 'in0' are copied to the left half of 1391 'out0' & even byte elements of 'in1' are copied to the right 1392 half of 'out0'. 1393 */ 1394 #define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1395 { \ 1396 out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1); \ 1397 out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3); \ 1398 } 1399 #define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__) 1400 #define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__) 1401 #define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__) 1402 1403 #define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ 1404 out2, out3) \ 1405 { \ 1406 PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ 1407 PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ 1408 } 1409 #define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__) 1410 #define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__) 1411 #define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__) 1412 1413 /* Description : Pack even halfword elements of vector pairs 1414 Arguments : Inputs - in0, in1, in2, in3 1415 Outputs - out0, out1 1416 Return Type - as per RTYPE 1417 Details : Even halfword elements of 'in0' are copied to the left half of 1418 'out0' & even halfword elements of 'in1' are copied to the 1419 right half of 'out0'. 1420 */ 1421 #define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1422 { \ 1423 out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1); \ 1424 out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3); \ 1425 } 1426 #define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__) 1427 #define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__) 1428 1429 #define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ 1430 out2, out3) \ 1431 { \ 1432 PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ 1433 PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ 1434 } 1435 #define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__) 1436 1437 /* Description : Pack even double word elements of vector pairs 1438 Arguments : Inputs - in0, in1, in2, in3 1439 Outputs - out0, out1 1440 Return Type - as per RTYPE 1441 Details : Even double elements of 'in0' are copied to the left half of 1442 'out0' & even double elements of 'in1' are copied to the right 1443 half of 'out0'. 1444 */ 1445 #define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1446 { \ 1447 out0 = (RTYPE)__msa_pckev_d((v2i64)in0, (v2i64)in1); \ 1448 out1 = (RTYPE)__msa_pckev_d((v2i64)in2, (v2i64)in3); \ 1449 } 1450 #define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__) 1451 #define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__) 1452 1453 #define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ 1454 out2, out3) \ 1455 { \ 1456 PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ 1457 PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3); \ 1458 } 1459 #define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__) 1460 1461 /* Description : Each byte element is logically xor'ed with immediate 128 1462 Arguments : Inputs - in0, in1 1463 Outputs - in place operation 1464 Return Type - as per RTYPE 1465 Details : Each unsigned byte element from input vector 'in0' is 1466 logically xor'ed with 128 and the result is stored in-place. 1467 */ 1468 #define XORI_B2_128(RTYPE, in0, in1) \ 1469 { \ 1470 in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128); \ 1471 in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128); \ 1472 } 1473 #define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__) 1474 #define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__) 1475 1476 #define XORI_B3_128(RTYPE, in0, in1, in2) \ 1477 { \ 1478 XORI_B2_128(RTYPE, in0, in1); \ 1479 in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128); \ 1480 } 1481 #define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__) 1482 1483 #define XORI_B4_128(RTYPE, in0, in1, in2, in3) \ 1484 { \ 1485 XORI_B2_128(RTYPE, in0, in1); \ 1486 XORI_B2_128(RTYPE, in2, in3); \ 1487 } 1488 #define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__) 1489 #define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__) 1490 1491 #define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) \ 1492 { \ 1493 XORI_B4_128(RTYPE, in0, in1, in2, in3); \ 1494 XORI_B3_128(RTYPE, in4, in5, in6); \ 1495 } 1496 #define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__) 1497 1498 /* Description : Average of signed halfword elements -> (a + b) / 2 1499 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 1500 Outputs - out0, out1, out2, out3 1501 Return Type - as per RTYPE 1502 Details : Each signed halfword element from 'in0' is added to each 1503 signed halfword element of 'in1' with full precision resulting 1504 in one extra bit in the result. The result is then divided by 1505 2 and written to 'out0' 1506 */ 1507 #define AVE_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ 1508 out2, out3) \ 1509 { \ 1510 out0 = (RTYPE)__msa_ave_s_h((v8i16)in0, (v8i16)in1); \ 1511 out1 = (RTYPE)__msa_ave_s_h((v8i16)in2, (v8i16)in3); \ 1512 out2 = (RTYPE)__msa_ave_s_h((v8i16)in4, (v8i16)in5); \ 1513 out3 = (RTYPE)__msa_ave_s_h((v8i16)in6, (v8i16)in7); \ 1514 } 1515 #define AVE_SH4_SH(...) AVE_SH4(v8i16, __VA_ARGS__) 1516 1517 /* Description : Addition of signed halfword elements and signed saturation 1518 Arguments : Inputs - in0, in1, in2, in3 1519 Outputs - out0, out1 1520 Return Type - as per RTYPE 1521 Details : Signed halfword elements from 'in0' are added to signed 1522 halfword elements of 'in1'. The result is then signed saturated 1523 between halfword data type range 1524 */ 1525 #define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1526 { \ 1527 out0 = (RTYPE)__msa_adds_s_h((v8i16)in0, (v8i16)in1); \ 1528 out1 = (RTYPE)__msa_adds_s_h((v8i16)in2, (v8i16)in3); \ 1529 } 1530 #define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__) 1531 1532 #define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ 1533 out2, out3) \ 1534 { \ 1535 ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1); \ 1536 ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3); \ 1537 } 1538 #define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__) 1539 1540 /* Description : Shift left all elements of vector (generic for all data types) 1541 Arguments : Inputs - in0, in1, in2, in3, shift 1542 Outputs - in place operation 1543 Return Type - as per input vector RTYPE 1544 Details : Each element of vector 'in0' is left shifted by 'shift' and 1545 the result is written in-place. 1546 */ 1547 #define SLLI_4V(in0, in1, in2, in3, shift) \ 1548 { \ 1549 in0 = in0 << shift; \ 1550 in1 = in1 << shift; \ 1551 in2 = in2 << shift; \ 1552 in3 = in3 << shift; \ 1553 } 1554 1555 /* Description : Arithmetic shift right all elements of vector 1556 (generic for all data types) 1557 Arguments : Inputs - in0, in1, in2, in3, shift 1558 Outputs - in place operation 1559 Return Type - as per input vector RTYPE 1560 Details : Each element of vector 'in0' is right shifted by 'shift' and 1561 the result is written in-place. 'shift' is a GP variable. 1562 */ 1563 #define SRA_2V(in0, in1, shift) \ 1564 { \ 1565 in0 = in0 >> shift; \ 1566 in1 = in1 >> shift; \ 1567 } 1568 1569 #define SRA_4V(in0, in1, in2, in3, shift) \ 1570 { \ 1571 in0 = in0 >> shift; \ 1572 in1 = in1 >> shift; \ 1573 in2 = in2 >> shift; \ 1574 in3 = in3 >> shift; \ 1575 } 1576 1577 /* Description : Shift right arithmetic rounded words 1578 Arguments : Inputs - in0, in1, shift 1579 Outputs - in place operation 1580 Return Type - as per RTYPE 1581 Details : Each element of vector 'in0' is shifted right arithmetically by 1582 the number of bits in the corresponding element in the vector 1583 'shift'. The last discarded bit is added to shifted value for 1584 rounding and the result is written in-place. 1585 'shift' is a vector. 1586 */ 1587 #define SRAR_W2(RTYPE, in0, in1, shift) \ 1588 { \ 1589 in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)shift); \ 1590 in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)shift); \ 1591 } 1592 1593 #define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) \ 1594 { \ 1595 SRAR_W2(RTYPE, in0, in1, shift) \ 1596 SRAR_W2(RTYPE, in2, in3, shift) \ 1597 } 1598 #define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__) 1599 1600 /* Description : Shift right arithmetic rounded (immediate) 1601 Arguments : Inputs - in0, in1, shift 1602 Outputs - in place operation 1603 Return Type - as per RTYPE 1604 Details : Each element of vector 'in0' is shifted right arithmetically by 1605 the value in 'shift'. The last discarded bit is added to the 1606 shifted value for rounding and the result is written in-place. 1607 'shift' is an immediate value. 1608 */ 1609 #define SRARI_H2(RTYPE, in0, in1, shift) \ 1610 { \ 1611 in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift); \ 1612 in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift); \ 1613 } 1614 #define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__) 1615 #define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__) 1616 1617 #define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) \ 1618 { \ 1619 SRARI_H2(RTYPE, in0, in1, shift); \ 1620 SRARI_H2(RTYPE, in2, in3, shift); \ 1621 } 1622 #define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__) 1623 #define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__) 1624 1625 #define SRARI_W2(RTYPE, in0, in1, shift) \ 1626 { \ 1627 in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift); \ 1628 in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift); \ 1629 } 1630 #define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__) 1631 1632 #define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) \ 1633 { \ 1634 SRARI_W2(RTYPE, in0, in1, shift); \ 1635 SRARI_W2(RTYPE, in2, in3, shift); \ 1636 } 1637 #define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__) 1638 1639 /* Description : Logical shift right all elements of vector (immediate) 1640 Arguments : Inputs - in0, in1, in2, in3, shift 1641 Outputs - out0, out1, out2, out3 1642 Return Type - as per RTYPE 1643 Details : Each element of vector 'in0' is right shifted by 'shift' and 1644 the result is written in-place. 'shift' is an immediate value. 1645 */ 1646 #define SRLI_H4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, shift) \ 1647 { \ 1648 out0 = (RTYPE)__msa_srli_h((v8i16)in0, shift); \ 1649 out1 = (RTYPE)__msa_srli_h((v8i16)in1, shift); \ 1650 out2 = (RTYPE)__msa_srli_h((v8i16)in2, shift); \ 1651 out3 = (RTYPE)__msa_srli_h((v8i16)in3, shift); \ 1652 } 1653 #define SRLI_H4_SH(...) SRLI_H4(v8i16, __VA_ARGS__) 1654 1655 /* Description : Multiplication of pairs of vectors 1656 Arguments : Inputs - in0, in1, in2, in3 1657 Outputs - out0, out1 1658 Details : Each element from 'in0' is multiplied with elements from 'in1' 1659 and the result is written to 'out0' 1660 */ 1661 #define MUL2(in0, in1, in2, in3, out0, out1) \ 1662 { \ 1663 out0 = in0 * in1; \ 1664 out1 = in2 * in3; \ 1665 } 1666 #define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \ 1667 { \ 1668 MUL2(in0, in1, in2, in3, out0, out1); \ 1669 MUL2(in4, in5, in6, in7, out2, out3); \ 1670 } 1671 1672 /* Description : Addition of 2 pairs of vectors 1673 Arguments : Inputs - in0, in1, in2, in3 1674 Outputs - out0, out1 1675 Details : Each element in 'in0' is added to 'in1' and result is written 1676 to 'out0'. 1677 */ 1678 #define ADD2(in0, in1, in2, in3, out0, out1) \ 1679 { \ 1680 out0 = in0 + in1; \ 1681 out1 = in2 + in3; \ 1682 } 1683 #define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \ 1684 { \ 1685 ADD2(in0, in1, in2, in3, out0, out1); \ 1686 ADD2(in4, in5, in6, in7, out2, out3); \ 1687 } 1688 1689 /* Description : Subtraction of 2 pairs of vectors 1690 Arguments : Inputs - in0, in1, in2, in3 1691 Outputs - out0, out1 1692 Details : Each element in 'in1' is subtracted from 'in0' and result is 1693 written to 'out0'. 1694 */ 1695 #define SUB2(in0, in1, in2, in3, out0, out1) \ 1696 { \ 1697 out0 = in0 - in1; \ 1698 out1 = in2 - in3; \ 1699 } 1700 #define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \ 1701 { \ 1702 out0 = in0 - in1; \ 1703 out1 = in2 - in3; \ 1704 out2 = in4 - in5; \ 1705 out3 = in6 - in7; \ 1706 } 1707 1708 /* Description : Sign extend halfword elements from right half of the vector 1709 Arguments : Input - in (halfword vector) 1710 Output - out (sign extended word vector) 1711 Return Type - signed word 1712 Details : Sign bit of halfword elements from input vector 'in' is 1713 extracted and interleaved with same vector 'in0' to generate 1714 4 word elements keeping sign intact 1715 */ 1716 #define UNPCK_R_SH_SW(in, out) \ 1717 { \ 1718 v8i16 sign_m; \ 1719 \ 1720 sign_m = __msa_clti_s_h((v8i16)in, 0); \ 1721 out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in); \ 1722 } 1723 1724 /* Description : Zero extend unsigned byte elements to halfword elements 1725 Arguments : Input - in (unsigned byte vector) 1726 Outputs - out0, out1 (unsigned halfword vectors) 1727 Return Type - signed halfword 1728 Details : Zero extended right half of vector is returned in 'out0' 1729 Zero extended left half of vector is returned in 'out1' 1730 */ 1731 #define UNPCK_UB_SH(in, out0, out1) \ 1732 { \ 1733 v16i8 zero_m = { 0 }; \ 1734 \ 1735 ILVRL_B2_SH(zero_m, in, out0, out1); \ 1736 } 1737 1738 /* Description : Sign extend halfword elements from input vector and return 1739 the result in pair of vectors 1740 Arguments : Input - in (halfword vector) 1741 Outputs - out0, out1 (sign extended word vectors) 1742 Return Type - signed word 1743 Details : Sign bit of halfword elements from input vector 'in' is 1744 extracted and interleaved right with same vector 'in0' to 1745 generate 4 signed word elements in 'out0' 1746 Then interleaved left with same vector 'in0' to 1747 generate 4 signed word elements in 'out1' 1748 */ 1749 #define UNPCK_SH_SW(in, out0, out1) \ 1750 { \ 1751 v8i16 tmp_m; \ 1752 \ 1753 tmp_m = __msa_clti_s_h((v8i16)in, 0); \ 1754 ILVRL_H2_SW(tmp_m, in, out0, out1); \ 1755 } 1756 1757 /* Description : Butterfly of 4 input vectors 1758 Arguments : Inputs - in0, in1, in2, in3 1759 Outputs - out0, out1, out2, out3 1760 Details : Butterfly operation 1761 */ 1762 #define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) \ 1763 { \ 1764 out0 = in0 + in3; \ 1765 out1 = in1 + in2; \ 1766 \ 1767 out2 = in1 - in2; \ 1768 out3 = in0 - in3; \ 1769 } 1770 1771 /* Description : Butterfly of 8 input vectors 1772 Arguments : Inputs - in0 ... in7 1773 Outputs - out0 .. out7 1774 Details : Butterfly operation 1775 */ 1776 #define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ 1777 out3, out4, out5, out6, out7) \ 1778 { \ 1779 out0 = in0 + in7; \ 1780 out1 = in1 + in6; \ 1781 out2 = in2 + in5; \ 1782 out3 = in3 + in4; \ 1783 \ 1784 out4 = in3 - in4; \ 1785 out5 = in2 - in5; \ 1786 out6 = in1 - in6; \ 1787 out7 = in0 - in7; \ 1788 } 1789 1790 /* Description : Butterfly of 16 input vectors 1791 Arguments : Inputs - in0 ... in15 1792 Outputs - out0 .. out15 1793 Details : Butterfly operation 1794 */ 1795 #define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, \ 1796 in11, in12, in13, in14, in15, out0, out1, out2, out3, \ 1797 out4, out5, out6, out7, out8, out9, out10, out11, out12, \ 1798 out13, out14, out15) \ 1799 { \ 1800 out0 = in0 + in15; \ 1801 out1 = in1 + in14; \ 1802 out2 = in2 + in13; \ 1803 out3 = in3 + in12; \ 1804 out4 = in4 + in11; \ 1805 out5 = in5 + in10; \ 1806 out6 = in6 + in9; \ 1807 out7 = in7 + in8; \ 1808 \ 1809 out8 = in7 - in8; \ 1810 out9 = in6 - in9; \ 1811 out10 = in5 - in10; \ 1812 out11 = in4 - in11; \ 1813 out12 = in3 - in12; \ 1814 out13 = in2 - in13; \ 1815 out14 = in1 - in14; \ 1816 out15 = in0 - in15; \ 1817 } 1818 1819 /* Description : Transpose input 8x8 byte block 1820 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 1821 Outputs - out0, out1, out2, out3, out4, out5, out6, out7 1822 Return Type - as per RTYPE 1823 */ 1824 #define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, \ 1825 out1, out2, out3, out4, out5, out6, out7) \ 1826 { \ 1827 v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ 1828 v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ 1829 \ 1830 ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, tmp0_m, tmp1_m, tmp2_m, \ 1831 tmp3_m); \ 1832 ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m); \ 1833 ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m); \ 1834 ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2); \ 1835 ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6); \ 1836 SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8); \ 1837 SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8); \ 1838 } 1839 #define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__) 1840 1841 /* Description : Transpose 16x8 block into 8x16 with byte elements in vectors 1842 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, 1843 in8, in9, in10, in11, in12, in13, in14, in15 1844 Outputs - out0, out1, out2, out3, out4, out5, out6, out7 1845 Return Type - unsigned byte 1846 */ 1847 #define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, \ 1848 in10, in11, in12, in13, in14, in15, out0, out1, \ 1849 out2, out3, out4, out5, out6, out7) \ 1850 { \ 1851 v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ 1852 v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ 1853 \ 1854 ILVEV_D2_UB(in0, in8, in1, in9, out7, out6); \ 1855 ILVEV_D2_UB(in2, in10, in3, in11, out5, out4); \ 1856 ILVEV_D2_UB(in4, in12, in5, in13, out3, out2); \ 1857 ILVEV_D2_UB(in6, in14, in7, in15, out1, out0); \ 1858 \ 1859 tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7); \ 1860 tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7); \ 1861 tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5); \ 1862 tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5); \ 1863 out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3); \ 1864 tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3); \ 1865 out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1); \ 1866 tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1); \ 1867 \ 1868 ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \ 1869 out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ 1870 out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ 1871 \ 1872 tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m); \ 1873 tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5); \ 1874 out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ 1875 out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ 1876 \ 1877 ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \ 1878 out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ 1879 out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ 1880 \ 1881 tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \ 1882 tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \ 1883 tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \ 1884 tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \ 1885 out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ 1886 out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ 1887 } 1888 1889 /* Description : Transpose 4x4 block with half word elements in vectors 1890 Arguments : Inputs - in0, in1, in2, in3 1891 Outputs - out0, out1, out2, out3 1892 Return Type - signed halfword 1893 */ 1894 #define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \ 1895 { \ 1896 v8i16 s0_m, s1_m; \ 1897 \ 1898 ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m); \ 1899 ILVRL_W2_SH(s1_m, s0_m, out0, out2); \ 1900 out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ 1901 out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2); \ 1902 } 1903 1904 /* Description : Transpose 4x8 block with half word elements in vectors 1905 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 1906 Outputs - out0, out1, out2, out3, out4, out5, out6, out7 1907 Return Type - signed halfword 1908 */ 1909 #define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ 1910 out2, out3, out4, out5, out6, out7) \ 1911 { \ 1912 v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ 1913 v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n; \ 1914 v8i16 zero_m = { 0 }; \ 1915 \ 1916 ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, tmp0_n, tmp1_n, tmp2_n, \ 1917 tmp3_n); \ 1918 ILVRL_W2_SH(tmp1_n, tmp0_n, tmp0_m, tmp2_m); \ 1919 ILVRL_W2_SH(tmp3_n, tmp2_n, tmp1_m, tmp3_m); \ 1920 \ 1921 out0 = (v8i16)__msa_ilvr_d((v2i64)tmp1_m, (v2i64)tmp0_m); \ 1922 out1 = (v8i16)__msa_ilvl_d((v2i64)tmp1_m, (v2i64)tmp0_m); \ 1923 out2 = (v8i16)__msa_ilvr_d((v2i64)tmp3_m, (v2i64)tmp2_m); \ 1924 out3 = (v8i16)__msa_ilvl_d((v2i64)tmp3_m, (v2i64)tmp2_m); \ 1925 \ 1926 out4 = zero_m; \ 1927 out5 = zero_m; \ 1928 out6 = zero_m; \ 1929 out7 = zero_m; \ 1930 } 1931 1932 /* Description : Transpose 8x4 block with half word elements in vectors 1933 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 1934 Outputs - out0, out1, out2, out3, out4, out5, out6, out7 1935 Return Type - signed halfword 1936 */ 1937 #define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \ 1938 { \ 1939 v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ 1940 \ 1941 ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m); \ 1942 ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m); \ 1943 ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2); \ 1944 ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3); \ 1945 } 1946 1947 /* Description : Transpose 8x8 block with half word elements in vectors 1948 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 1949 Outputs - out0, out1, out2, out3, out4, out5, out6, out7 1950 Return Type - as per RTYPE 1951 */ 1952 #define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, \ 1953 out1, out2, out3, out4, out5, out6, out7) \ 1954 { \ 1955 v8i16 s0_m, s1_m; \ 1956 v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ 1957 v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ 1958 \ 1959 ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \ 1960 ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m); \ 1961 ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \ 1962 ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m); \ 1963 ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \ 1964 ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m); \ 1965 ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \ 1966 ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m); \ 1967 PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, tmp3_m, \ 1968 tmp7_m, out0, out2, out4, out6); \ 1969 out1 = (RTYPE)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m); \ 1970 out3 = (RTYPE)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m); \ 1971 out5 = (RTYPE)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m); \ 1972 out7 = (RTYPE)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m); \ 1973 } 1974 #define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__) 1975 1976 /* Description : Transpose 4x4 block with word elements in vectors 1977 Arguments : Inputs - in0, in1, in2, in3 1978 Outputs - out0, out1, out2, out3 1979 Return Type - signed word 1980 */ 1981 #define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) \ 1982 { \ 1983 v4i32 s0_m, s1_m, s2_m, s3_m; \ 1984 \ 1985 ILVRL_W2_SW(in1, in0, s0_m, s1_m); \ 1986 ILVRL_W2_SW(in3, in2, s2_m, s3_m); \ 1987 \ 1988 out0 = (v4i32)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m); \ 1989 out1 = (v4i32)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m); \ 1990 out2 = (v4i32)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m); \ 1991 out3 = (v4i32)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m); \ 1992 } 1993 1994 /* Description : Add block 4x4 1995 Arguments : Inputs - in0, in1, in2, in3, pdst, stride 1996 Details : Least significant 4 bytes from each input vector are added to 1997 the destination bytes, clipped between 0-255 and stored. 1998 */ 1999 #define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) \ 2000 { \ 2001 uint32_t src0_m, src1_m, src2_m, src3_m; \ 2002 v8i16 inp0_m, inp1_m, res0_m, res1_m; \ 2003 v16i8 dst0_m = { 0 }; \ 2004 v16i8 dst1_m = { 0 }; \ 2005 v16i8 zero_m = { 0 }; \ 2006 \ 2007 ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m) \ 2008 LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m); \ 2009 INSERT_W2_SB(src0_m, src1_m, dst0_m); \ 2010 INSERT_W2_SB(src2_m, src3_m, dst1_m); \ 2011 ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m); \ 2012 ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m); \ 2013 CLIP_SH2_0_255(res0_m, res1_m); \ 2014 PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \ 2015 ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride); \ 2016 } 2017 2018 /* Description : Pack even elements of input vectors & xor with 128 2019 Arguments : Inputs - in0, in1 2020 Output - out_m 2021 Return Type - unsigned byte 2022 Details : Signed byte even elements from 'in0' and 'in1' are packed 2023 together in one vector and the resulting vector is xor'ed with 2024 128 to shift the range from signed to unsigned byte 2025 */ 2026 #define PCKEV_XORI128_UB(in0, in1) \ 2027 ({ \ 2028 v16u8 out_m; \ 2029 \ 2030 out_m = (v16u8)__msa_pckev_b((v16i8)in1, (v16i8)in0); \ 2031 out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128); \ 2032 out_m; \ 2033 }) 2034 2035 /* Description : Converts inputs to unsigned bytes, interleave, average & store 2036 as 8x4 unsigned byte block 2037 Arguments : Inputs - in0, in1, in2, in3, dst0, dst1, dst2, dst3, 2038 pdst, stride 2039 */ 2040 #define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, dst2, dst3, \ 2041 pdst, stride) \ 2042 { \ 2043 v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ 2044 \ 2045 tmp0_m = PCKEV_XORI128_UB(in0, in1); \ 2046 tmp1_m = PCKEV_XORI128_UB(in2, in3); \ 2047 ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \ 2048 AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \ 2049 ST8x4_UB(tmp0_m, tmp1_m, pdst, stride); \ 2050 } 2051 2052 /* Description : Pack even byte elements and store byte vector in destination 2053 memory 2054 Arguments : Inputs - in0, in1, pdst 2055 */ 2056 #define PCKEV_ST_SB(in0, in1, pdst) \ 2057 { \ 2058 v16i8 tmp_m; \ 2059 \ 2060 tmp_m = __msa_pckev_b((v16i8)in1, (v16i8)in0); \ 2061 ST_SB(tmp_m, (pdst)); \ 2062 } 2063 2064 /* Description : Horizontal 2 tap filter kernel code 2065 Arguments : Inputs - in0, in1, mask, coeff, shift 2066 */ 2067 #define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) \ 2068 ({ \ 2069 v16i8 tmp0_m; \ 2070 v8u16 tmp1_m; \ 2071 \ 2072 tmp0_m = __msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0); \ 2073 tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)coeff); \ 2074 tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift); \ 2075 \ 2076 tmp1_m; \ 2077 }) 2078 #endif /* VPX_DSP_MIPS_MACROS_MSA_H_ */ 2079