/*
 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VPX_VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_
#define VPX_VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_

#include <msa.h>

#include "./vpx_config.h"
#include "vpx/vpx_integer.h"

/* Description : Load one MSA vector register (byte/halfword/word element
                 view) from memory via a plain vector-typed dereference.
   NOTE(review): assumes 'psrc' is adequately aligned for a vector access —
                 confirm against callers. */
#define LD_B(RTYPE, psrc) *((const RTYPE *)(psrc))
#define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
#define LD_SB(...) LD_B(v16i8, __VA_ARGS__)

#define LD_H(RTYPE, psrc) *((const RTYPE *)(psrc))
#define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
#define LD_SH(...) LD_H(v8i16, __VA_ARGS__)

#define LD_W(RTYPE, psrc) *((const RTYPE *)(psrc))
#define LD_UW(...) LD_W(v4u32, __VA_ARGS__)
#define LD_SW(...) LD_W(v4i32, __VA_ARGS__)

/* Description : Store one MSA vector register to memory (counterparts of the
                 LD_* macros above). */
#define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
#define ST_SB(...) ST_B(v16i8, __VA_ARGS__)

#define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
#define ST_UH(...) ST_H(v8u16, __VA_ARGS__)
#define ST_SH(...) ST_H(v8i16, __VA_ARGS__)

#define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
#define ST_SW(...) ST_W(v4i32, __VA_ARGS__)

/* Scalar unaligned loads/stores.  MIPS release 6 removed the lwl/lwr and
   ldl/ldr pairs and made ordinary lw/ld/sh/sw/sd tolerant of unaligned
   addresses, hence the two implementations selected below. */
#if (__mips_isa_rev >= 6)
#define LW(psrc)                                     \
  ({                                                 \
    const uint8_t *psrc_m = (const uint8_t *)(psrc); \
    uint32_t val_m;                                  \
                                                     \
    asm volatile("lw %[val_m], %[psrc_m] \n\t"       \
                                                     \
                 : [val_m] "=r"(val_m)               \
                 : [psrc_m] "m"(*psrc_m));           \
                                                     \
    val_m;                                           \
  })

#if (__mips == 64)
#define LD(psrc)                                     \
  ({                                                 \
    const uint8_t *psrc_m = (const uint8_t *)(psrc); \
    uint64_t val_m = 0;                              \
                                                     \
    asm volatile("ld %[val_m], %[psrc_m] \n\t"       \
                                                     \
                 : [val_m] "=r"(val_m)               \
                 : [psrc_m] "m"(*psrc_m));           \
                                                     \
    val_m;                                           \
  })
#else  // !(__mips == 64)
/* 32-bit ABI: synthesize the 64-bit load from two 32-bit loads
   (little-endian: low word first). */
#define LD(psrc)                                                  \
  ({                                                              \
    const uint8_t *psrc_ld = (const uint8_t *)(psrc);             \
    uint32_t val0_m, val1_m;                                      \
    uint64_t val_m = 0;                                           \
                                                                  \
    val0_m = LW(psrc_ld);                                         \
    val1_m = LW(psrc_ld + 4);                                     \
                                                                  \
    val_m = (uint64_t)(val1_m);                                   \
    val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000);       \
    val_m = (uint64_t)(val_m | (uint64_t)val0_m);                 \
                                                                  \
    val_m;                                                        \
  })
#endif  // (__mips == 64)

#define SH(val, pdst)                            \
  {                                              \
    uint8_t *pdst_m = (uint8_t *)(pdst);         \
    const uint16_t val_m = (val);                \
                                                 \
    asm volatile("sh %[val_m], %[pdst_m] \n\t"   \
                                                 \
                 : [pdst_m] "=m"(*pdst_m)        \
                 : [val_m] "r"(val_m));          \
  }

#define SW(val, pdst)                            \
  {                                              \
    uint8_t *pdst_m = (uint8_t *)(pdst);         \
    const uint32_t val_m = (val);                \
                                                 \
    asm volatile("sw %[val_m], %[pdst_m] \n\t"   \
                                                 \
                 : [pdst_m] "=m"(*pdst_m)        \
                 : [val_m] "r"(val_m));          \
  }

#define SD(val, pdst)                            \
  {                                              \
    uint8_t *pdst_m = (uint8_t *)(pdst);         \
    const uint64_t val_m = (val);                \
                                                 \
    asm volatile("sd %[val_m], %[pdst_m] \n\t"   \
                                                 \
                 : [pdst_m] "=m"(*pdst_m)        \
                 : [val_m] "r"(val_m));          \
  }
#else  // !(__mips_isa_rev >= 6)
/* Pre-R6: unaligned 32-bit load via the lwr/lwl pair.  "=&r" (earlyclobber)
   is required because val_m is partially written by the first instruction
   while psrc_m is still live for the second. */
#define LW(psrc)                                     \
  ({                                                 \
    const uint8_t *psrc_m = (const uint8_t *)(psrc); \
    uint32_t val_m;                                  \
                                                     \
    asm volatile(                                    \
        "lwr %[val_m], 0(%[psrc_m]) \n\t"            \
        "lwl %[val_m], 3(%[psrc_m]) \n\t"            \
        : [val_m] "=&r"(val_m)                       \
        : [psrc_m] "r"(psrc_m));                     \
                                                     \
    val_m;                                           \
  })

#if (__mips == 64)
#define LD(psrc)                                     \
  ({                                                 \
    const uint8_t *psrc_m = (const uint8_t *)(psrc); \
    uint64_t val_m = 0;                              \
                                                     \
    asm volatile(                                    \
        "ldr %[val_m], 0(%[psrc_m]) \n\t"            \
        "ldl %[val_m], 7(%[psrc_m]) \n\t"            \
        : [val_m] "=&r"(val_m)                       \
        : [psrc_m] "r"(psrc_m));                     \
                                                     \
    val_m;                                           \
  })
#else  // !(__mips == 64)
#define LD(psrc)                                                  \
  ({                                                              \
    const uint8_t *psrc_m1 = (const uint8_t *)(psrc);             \
    uint32_t val0_m, val1_m;                                      \
    uint64_t val_m = 0;                                           \
                                                                  \
    val0_m = LW(psrc_m1);                                         \
    val1_m = LW(psrc_m1 + 4);                                     \
                                                                  \
    val_m = (uint64_t)(val1_m);                                   \
    val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000);       \
    val_m = (uint64_t)(val_m | (uint64_t)val0_m);                 \
                                                                  \
    val_m;                                                        \
  })
#endif  // (__mips == 64)

/* Pre-R6 unaligned stores use the ush/usw assembler macros. */
#define SH(val, pdst)                             \
  {                                               \
    uint8_t *pdst_m = (uint8_t *)(pdst);          \
    const uint16_t val_m = (val);                 \
                                                  \
    asm volatile("ush %[val_m], %[pdst_m] \n\t"   \
                                                  \
                 : [pdst_m] "=m"(*pdst_m)         \
                 : [val_m] "r"(val_m));           \
  }

#define SW(val, pdst)                             \
  {                                               \
    uint8_t *pdst_m = (uint8_t *)(pdst);          \
    const uint32_t val_m = (val);                 \
                                                  \
    asm volatile("usw %[val_m], %[pdst_m] \n\t"   \
                                                  \
                 : [pdst_m] "=m"(*pdst_m)         \
                 : [val_m] "r"(val_m));           \
  }

/* Pre-R6 64-bit store: two 32-bit stores (little-endian: low word first). */
#define SD(val, pdst)                                        \
  {                                                          \
    uint8_t *pdst_m1 = (uint8_t *)(pdst);                    \
    uint32_t val0_m, val1_m;                                 \
                                                             \
    val0_m = (uint32_t)((val)&0x00000000FFFFFFFF);           \
    val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
                                                             \
    SW(val0_m, pdst_m1);                                     \
    SW(val1_m, pdst_m1 + 4);                                 \
  }
#endif  // (__mips_isa_rev >= 6)

/* Description : Load 4 words with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1, out2, out3
   Details     : Load word in 'out0' from (psrc)
                 Load word in 'out1' from (psrc + stride)
                 Load word in 'out2' from (psrc + 2 * stride)
                 Load word in 'out3' from (psrc + 3 * stride)
*/
#define LW4(psrc, stride, out0, out1, out2, out3) \
  {                                               \
    out0 = LW((psrc));                            \
    out1 = LW((psrc) + stride);                   \
    out2 = LW((psrc) + 2 * stride);               \
    out3 = LW((psrc) + 3 * stride);               \
  }

/* Description : Load double words with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1
   Details     : Load double word in 'out0' from (psrc)
                 Load double word in 'out1' from (psrc + stride)
*/
#define LD2(psrc, stride, out0, out1) \
  {                                   \
    out0 = LD((psrc));                \
    out1 = LD((psrc) + stride);       \
  }
#define LD4(psrc, stride, out0, out1, out2, out3) \
  {                                               \
    LD2((psrc), stride, out0, out1);              \
    LD2((psrc) + 2 * stride, stride, out2, out3); \
  }

/* Description : Store 4 words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Store word from 'in0' to (pdst)
                 Store word from 'in1' to (pdst + stride)
                 Store word from 'in2' to (pdst + 2 * stride)
                 Store word from 'in3' to (pdst + 3 * stride)
*/
#define SW4(in0, in1, in2, in3, pdst, stride) \
  {                                           \
    SW(in0, (pdst));                          \
    SW(in1, (pdst) + stride);                 \
    SW(in2, (pdst) + 2 * stride);             \
    SW(in3, (pdst) + 3 * stride);             \
  }

/* Description : Store 4 double words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Store double word from 'in0' to (pdst)
                 Store double word from 'in1' to (pdst + stride)
                 Store double word from 'in2' to (pdst + 2 * stride)
                 Store double word from 'in3' to (pdst + 3 * stride)
*/
#define SD4(in0, in1, in2, in3, pdst, stride) \
  {                                           \
    SD(in0, (pdst));                          \
    SD(in1, (pdst) + stride);                 \
    SD(in2, (pdst) + 2 * stride);             \
    SD(in3, (pdst) + 3 * stride);             \
  }

/* Description : Load vectors with 16 byte elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Load 16 byte elements in 'out0' from (psrc)
                 Load 16 byte elements in 'out1' from (psrc + stride)
*/
#define LD_B2(RTYPE, psrc, stride, out0, out1) \
  {                                            \
    out0 = LD_B(RTYPE, (psrc));                \
    out1 = LD_B(RTYPE, (psrc) + stride);       \
  }
#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)

#define LD_B3(RTYPE, psrc, stride, out0, out1, out2) \
  {                                                  \
    LD_B2(RTYPE, (psrc), stride, out0, out1);        \
    out2 = LD_B(RTYPE, (psrc) + 2 * stride);         \
  }
#define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__)
#define LD_SB3(...) LD_B3(v16i8, __VA_ARGS__)

#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \
  {                                                        \
    LD_B2(RTYPE, (psrc), stride, out0, out1);              \
    LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \
  }
#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)

#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \
  {                                                              \
    LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);        \
    out4 = LD_B(RTYPE, (psrc) + 4 * stride);                     \
  }
#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__)
#define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__)

#define LD_B8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \
              out7)                                                          \
  {                                                                          \
    LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);                    \
    LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7);       \
  }
#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)

/* Description : Load vectors with 8 halfword elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1
   Details     : Load 8 halfword elements in 'out0' from (psrc)
                 Load 8 halfword elements in 'out1' from (psrc + stride)
*/
#define LD_H2(RTYPE, psrc, stride, out0, out1) \
  {                                            \
    out0 = LD_H(RTYPE, (psrc));                \
    out1 = LD_H(RTYPE, (psrc) + (stride));     \
  }
#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)

#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) \
  {                                                        \
    LD_H2(RTYPE, (psrc), stride, out0, out1);              \
    LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \
  }
#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)

/* Description : Load 2 vectors of signed word elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1
                 Return Type - signed word
*/
#define LD_SW2(psrc, stride, out0, out1) \
  {                                      \
    out0 = LD_SW((psrc));                \
    out1 = LD_SW((psrc) + stride);       \
  }

/* Description : Store vectors of 16 byte elements with stride
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Store 16 byte elements from 'in0' to (pdst)
                 Store 16 byte elements from 'in1' to (pdst + stride)
*/
#define ST_B2(RTYPE, in0, in1, pdst, stride) \
  {                                          \
    ST_B(RTYPE, in0, (pdst));                \
    ST_B(RTYPE, in1, (pdst) + stride);       \
  }
#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)

#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride)   \
  {                                                      \
    ST_B2(RTYPE, in0, in1, (pdst), stride);              \
    ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
  }
#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
#define ST_SB4(...) ST_B4(v16i8, __VA_ARGS__)

#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
  {                                                                        \
    ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride);                        \
    ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride);         \
  }
#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)

/* Description : Store vectors of 8 halfword elements with stride
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Store 8 halfword elements from 'in0' to (pdst)
                 Store 8 halfword elements from 'in1' to (pdst + stride)
*/
#define ST_H2(RTYPE, in0, in1, pdst, stride) \
  {                                          \
    ST_H(RTYPE, in0, (pdst));                \
    ST_H(RTYPE, in1, (pdst) + stride);       \
  }
#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)

/* Description : Store vectors of word elements with stride
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Store 4 word elements from 'in0' to (pdst)
                 Store 4 word elements from 'in1' to (pdst + stride)
*/
#define ST_SW2(in0, in1, pdst, stride) \
  {                                    \
    ST_SW(in0, (pdst));                \
    ST_SW(in1, (pdst) + stride);       \
  }

/* Description : Store 2x4 byte block to destination memory from input vector
   Arguments   : Inputs - in, stidx, pdst, stride
   Details     : Index 'stidx' halfword element from 'in' vector is copied to
                 the GP register and stored to (pdst)
                 Index 'stidx+1' halfword element from 'in' vector is copied to
                 the GP register and stored to (pdst + stride)
                 Index 'stidx+2' halfword element from 'in' vector is copied to
                 the GP register and stored to (pdst + 2 * stride)
                 Index 'stidx+3' halfword element from 'in' vector is copied to
                 the GP register and stored to (pdst + 3 * stride)
*/
#define ST2x4_UB(in, stidx, pdst, stride)          \
  {                                                \
    uint16_t out0_m, out1_m, out2_m, out3_m;       \
    uint8_t *pblk_2x4_m = (uint8_t *)(pdst);       \
                                                   \
    out0_m = __msa_copy_u_h((v8i16)in, (stidx));   \
    out1_m = __msa_copy_u_h((v8i16)in, (stidx + 1)); \
    out2_m = __msa_copy_u_h((v8i16)in, (stidx + 2)); \
    out3_m = __msa_copy_u_h((v8i16)in, (stidx + 3)); \
                                                   \
    SH(out0_m, pblk_2x4_m);                        \
    SH(out1_m, pblk_2x4_m + stride);               \
    SH(out2_m, pblk_2x4_m + 2 * stride);           \
    SH(out3_m, pblk_2x4_m + 3 * stride);           \
  }

/* Description : Store 4x4 byte block to destination memory from input vector
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : 'Idx0' word element from input vector 'in0' is copied to the
                 GP register and stored to (pdst)
                 'Idx1' word element from input vector 'in0' is copied to the
                 GP register and stored to (pdst + stride)
                 'Idx2' word element from input vector 'in0' is copied to the
                 GP register and stored to (pdst + 2 * stride)
                 'Idx3' word element from input vector 'in0' is copied to the
                 GP register and stored to (pdst + 3 * stride)
*/
#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \
  {                                                              \
    uint32_t out0_m, out1_m, out2_m, out3_m;                     \
    uint8_t *pblk_4x4_m = (uint8_t *)(pdst);                     \
                                                                 \
    out0_m = __msa_copy_u_w((v4i32)in0, idx0);                   \
    out1_m = __msa_copy_u_w((v4i32)in0, idx1);                   \
    out2_m = __msa_copy_u_w((v4i32)in1, idx2);                   \
    out3_m = __msa_copy_u_w((v4i32)in1, idx3);                   \
                                                                 \
    SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride);     \
  }
#define ST4x8_UB(in0, in1, pdst, stride)                           \
  {                                                                \
    uint8_t *pblk_4x8 = (uint8_t *)(pdst);                         \
                                                                   \
    ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride);              \
    ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride); \
  }

/* Description : Store 8x1 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst
   Details     : Index 0 double word element from 'in' vector is copied to the
                 GP register and stored to (pdst)
*/
#define ST8x1_UB(in, pdst)                 \
  {                                        \
    uint64_t out0_m;                       \
                                           \
    out0_m = __msa_copy_u_d((v2i64)in, 0); \
    SD(out0_m, pdst);                      \
  }

/* Description : Store 8x2 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst, stride
   Details     : Index 0 double word element from 'in' vector is copied to the
                 GP register and stored to (pdst)
                 Index 1 double word element from 'in' vector is copied to the
                 GP register and stored to (pdst + stride)
*/
#define ST8x2_UB(in, pdst, stride)           \
  {                                          \
    uint64_t out0_m, out1_m;                 \
    uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \
                                             \
    out0_m = __msa_copy_u_d((v2i64)in, 0);   \
    out1_m = __msa_copy_u_d((v2i64)in, 1);   \
                                             \
    SD(out0_m, pblk_8x2_m);                  \
    SD(out1_m, pblk_8x2_m + stride);         \
  }

/* Description : Store 8x4 byte block to destination memory from input
                 vectors
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Index 0 double word element from 'in0' vector is copied to the
                 GP register and stored to (pdst)
                 Index 1 double word element from 'in0' vector is copied to the
                 GP register and stored to (pdst + stride)
                 Index 0 double word element from 'in1' vector is copied to the
                 GP register and stored to (pdst + 2 * stride)
                 Index 1 double word element from 'in1' vector is copied to the
                 GP register and stored to (pdst + 3 * stride)
*/
#define ST8x4_UB(in0, in1, pdst, stride)                     \
  {                                                          \
    uint64_t out0_m, out1_m, out2_m, out3_m;                 \
    uint8_t *pblk_8x4_m = (uint8_t *)(pdst);                 \
                                                             \
    out0_m = __msa_copy_u_d((v2i64)in0, 0);                  \
    out1_m = __msa_copy_u_d((v2i64)in0, 1);                  \
    out2_m = __msa_copy_u_d((v2i64)in1, 0);                  \
    out3_m = __msa_copy_u_d((v2i64)in1, 1);                  \
                                                             \
    SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \
  }

/* Description : Immediate number of elements to slide with zero
   Arguments   : Inputs  - in0, in1, slide_val
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Byte elements from 'zero_m' vector are slid into 'in0' by
                 value specified in the 'slide_val'
*/
#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val)             \
  {                                                                   \
    v16i8 zero_m = { 0 };                                             \
                                                                      \
    out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in0, slide_val); \
    out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in1, slide_val); \
  }
#define SLDI_B2_0_UB(...) SLDI_B2_0(v16u8, __VA_ARGS__)

/* Description : Immediate number of elements to slide
   Arguments   : Inputs  - in0_0, in0_1, in1_0, in1_1, slide_val
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Byte elements from 'in0_0' vector are slid into 'in1_0' by
                 value specified in the 'slide_val'
*/
#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \
  {                                                                       \
    out0 = (RTYPE)__msa_sldi_b((v16i8)in0_0, (v16i8)in1_0, slide_val);    \
    out1 = (RTYPE)__msa_sldi_b((v16i8)in0_1, (v16i8)in1_1, slide_val);    \
  }

#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, out0, out1, \
                out2, slide_val)                                             \
  {                                                                          \
    SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val);       \
    out2 = (RTYPE)__msa_sldi_b((v16i8)in0_2, (v16i8)in1_2, slide_val);       \
  }
#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)

/* Description : Shuffle byte vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Byte elements from 'in0' & 'in1' are copied selectively to
                 'out0' as per control vector 'mask0'
*/
#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)  \
  {                                                                   \
    out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \
    out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \
  }
#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)

#define VSHF_B3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2, \
                out0, out1, out2)                                         \
  {                                                                       \
    VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1);         \
    out2 = (RTYPE)__msa_vshf_b((v16i8)mask2, (v16i8)in5, (v16i8)in4);     \
  }
#define VSHF_B3_SB(...) VSHF_B3(v16i8, __VA_ARGS__)

/* Description : Shuffle halfword vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : halfword elements from 'in0' & 'in1' are copied selectively to
                 'out0' as per control vector 'mask0'
*/
#define VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)  \
  {                                                                   \
    out0 = (RTYPE)__msa_vshf_h((v8i16)mask0, (v8i16)in1, (v8i16)in0); \
    out1 = (RTYPE)__msa_vshf_h((v8i16)mask1, (v8i16)in3, (v8i16)in2); \
  }
#define VSHF_H2_SH(...) VSHF_H2(v8i16, __VA_ARGS__)

/* Description : Dot product of byte vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Unsigned byte elements from 'mult0' are multiplied with
                 unsigned byte elements from 'cnst0' producing a result
                 twice the size of input i.e. unsigned halfword.
                 The multiplication result of adjacent odd-even elements
                 are added together and written to the 'out0' vector
*/
#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
  {                                                             \
    out0 = (RTYPE)__msa_dotp_u_h((v16u8)mult0, (v16u8)cnst0);   \
    out1 = (RTYPE)__msa_dotp_u_h((v16u8)mult1, (v16u8)cnst1);   \
  }
#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)

#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
                 cnst3, out0, out1, out2, out3)                          \
  {                                                                      \
    DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);             \
    DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);             \
  }
#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)

/* Description : Dot product of byte vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed byte elements from 'mult0' are multiplied with
                 signed byte elements from 'cnst0' producing a result
                 twice the size of input i.e. signed halfword.
                 The multiplication result of adjacent odd-even elements
                 are added together and written to the 'out0' vector
*/
#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
  {                                                             \
    out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0);   \
    out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1);   \
  }
#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)

#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
                 cnst3, out0, out1, out2, out3)                          \
  {                                                                      \
    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);             \
    DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);             \
  }
#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)

/* Description : Dot product of halfword vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'mult0' are multiplied with
                 signed halfword elements from 'cnst0' producing a result
                 twice the size of input i.e. signed word.
                 The multiplication result of adjacent odd-even elements
                 are added together and written to the 'out0' vector
*/
#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
  {                                                             \
    out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0);   \
    out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1);   \
  }

#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
                 cnst3, out0, out1, out2, out3)                          \
  {                                                                      \
    DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);             \
    DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);             \
  }
#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)

/* Description : Dot product of word vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed word elements from 'mult0' are multiplied with
                 signed word elements from 'cnst0' producing a result
                 twice the size of input i.e. signed double word.
                 The multiplication result of adjacent odd-even elements
                 are added together and written to the 'out0' vector
*/
#define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
  {                                                             \
    out0 = (RTYPE)__msa_dotp_s_d((v4i32)mult0, (v4i32)cnst0);   \
    out1 = (RTYPE)__msa_dotp_s_d((v4i32)mult1, (v4i32)cnst1);   \
  }
#define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__)

/* Description : Dot product & addition of byte vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed byte elements from 'mult0' are multiplied with
                 signed byte elements from 'cnst0' producing a result
                 twice the size of input i.e. signed halfword.
                 The multiplication result of adjacent odd-even elements
                 are added to the 'out0' vector
*/
#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)             \
  {                                                                          \
    out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0, (v16i8)mult0, (v16i8)cnst0); \
    out1 = (RTYPE)__msa_dpadd_s_h((v8i16)out1, (v16i8)mult1, (v16i8)cnst1); \
  }
#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)

#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
                  cnst3, out0, out1, out2, out3)                          \
  {                                                                       \
    DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);             \
    DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);             \
  }
#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)

/* Description : Dot product & addition of halfword vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'mult0' are multiplied with
                 signed halfword elements from 'cnst0' producing a result
                 twice the size of input i.e. signed word.
                 The multiplication result of adjacent odd-even elements
                 are added to the 'out0' vector
*/
#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)             \
  {                                                                          \
    out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)mult0, (v8i16)cnst0); \
    out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)mult1, (v8i16)cnst1); \
  }
#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)

#define DPADD_SH4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
                  cnst3, out0, out1, out2, out3)                          \
  {                                                                       \
    DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);             \
    DPADD_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);             \
  }
#define DPADD_SH4_SW(...) DPADD_SH4(v4i32, __VA_ARGS__)

/* Description : Dot product & addition of double word vector elements
   Arguments   : Inputs  - mult0, mult1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each signed word element from 'mult0' is multiplied with itself
                 producing an intermediate result twice the size of it
                 i.e. signed double word
                 The multiplication result of adjacent odd-even elements
                 are added to the 'out0' vector
*/
#define DPADD_SD2(RTYPE, mult0, mult1, out0, out1)                           \
  {                                                                          \
    out0 = (RTYPE)__msa_dpadd_s_d((v2i64)out0, (v4i32)mult0, (v4i32)mult0); \
    out1 = (RTYPE)__msa_dpadd_s_d((v2i64)out1, (v4i32)mult1, (v4i32)mult1); \
  }
#define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__)

/* Description : Clips all signed halfword elements of input vector
                 between 0 & 255
   Arguments   : Input  - in
                 Output - out_m
                 Return Type - signed halfword
*/
#define CLIP_SH_0_255(in)                              \
  ({                                                   \
    v8i16 max_m = __msa_ldi_h(255);                    \
    v8i16 out_m;                                       \
                                                       \
    out_m = __msa_maxi_s_h((v8i16)in, 0);              \
    out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \
    out_m;                                             \
  })
#define CLIP_SH2_0_255(in0, in1) \
  {                              \
    in0 = CLIP_SH_0_255(in0);    \
    in1 = CLIP_SH_0_255(in1);    \
  }
#define CLIP_SH4_0_255(in0, in1, in2, in3) \
  {                                        \
    CLIP_SH2_0_255(in0, in1);              \
    CLIP_SH2_0_255(in2, in3);              \
  }

/* Description : Clips all signed word elements of input vector
                 between 0 & 255
   Arguments   : Input  - in
                 Output - out_m
                 Return Type - signed word
*/
#define CLIP_SW_0_255(in)                              \
  ({                                                   \
    v4i32 max_m = __msa_ldi_w(255);                    \
    v4i32 out_m;                                       \
                                                       \
    out_m = __msa_maxi_s_w((v4i32)in, 0);              \
    out_m = __msa_min_s_w((v4i32)max_m, (v4i32)out_m); \
    out_m;                                             \
  })

/* Description : Horizontal addition of 4 signed word elements of input vector
   Arguments   : Input  - in       (signed word vector)
                 Output - sum_m    (i32 sum)
                 Return Type - signed word (GP)
   Details     : 4 signed word elements of 'in' vector are added together and
                 the resulting integer sum is returned
*/
#define HADD_SW_S32(in)                            \
  ({                                               \
    v2i64 res0_m, res1_m;                          \
    int32_t sum_m;                                 \
                                                   \
    res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in); \
    res1_m = __msa_splati_d(res0_m, 1);            \
    res0_m = res0_m + res1_m;                      \
    sum_m = __msa_copy_s_w((v4i32)res0_m, 0);      \
    sum_m;                                         \
  })

/* Description : Horizontal addition of 8 unsigned halfword elements
   Arguments   : Inputs  - in       (unsigned halfword vector)
                 Outputs - sum_m    (u32 sum)
                 Return Type - unsigned word
   Details     : 8 unsigned halfword elements of input vector are added
                 together and the resulting integer sum is returned
*/
#define HADD_UH_U32(in)                               \
  ({                                                  \
    v4u32 res_m;                                      \
    v2u64 res0_m, res1_m;                             \
    uint32_t sum_m;                                   \
                                                      \
    res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in);     \
    res0_m = __msa_hadd_u_d(res_m, res_m);            \
    res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1); \
    res0_m = res0_m + res1_m;                         \
    sum_m = __msa_copy_u_w((v4i32)res0_m, 0);         \
    sum_m;                                            \
  })

/* Description : Horizontal addition of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each unsigned odd byte element from 'in0' is added to
                 even unsigned byte element from 'in0' (pairwise) and the
                 halfword result is written to 'out0'
*/
#define HADD_UB2(RTYPE, in0, in1, out0, out1)             \
  {                                                       \
    out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0); \
    out1 = (RTYPE)__msa_hadd_u_h((v16u8)in1, (v16u8)in1); \
  }
#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)

/* Description : Horizontal subtraction of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each unsigned odd byte element from 'in0' is subtracted from
                 even unsigned byte element from 'in0' (pairwise) and the
                 halfword result is written to 'out0'
*/
#define HSUB_UB2(RTYPE, in0, in1, out0, out1)             \
  {                                                       \
    out0 = (RTYPE)__msa_hsub_u_h((v16u8)in0, (v16u8)in0); \
    out1 = (RTYPE)__msa_hsub_u_h((v16u8)in1, (v16u8)in1); \
  }
#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)

/* Description : Horizontal subtraction of signed halfword vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each signed odd halfword element from 'in0' is subtracted from
                 even signed halfword element from 'in0' (pairwise) and the
                 word result is written to 'out0'
*/
#define HSUB_UH2(RTYPE, in0, in1, out0, out1)             \
  {                                                       \
    out0 = (RTYPE)__msa_hsub_s_w((v8i16)in0, (v8i16)in0); \
    out1 = (RTYPE)__msa_hsub_s_w((v8i16)in1, (v8i16)in1); \
  }
#define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__)

/* Description : Set element n input vector to GPR value
   Arguments   : Inputs - in0, in1, in2, in3
                 Output - out
                 Return Type - as per RTYPE
   Details     : Set element 0 in vector 'out' to value specified in 'in0'
*/
#define INSERT_D2(RTYPE, in0, in1, out)              \
  {                                                  \
    out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0); \
    out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1); \
  }
#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)

/* Description : Interleave even byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'
*/
#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
  {                                                     \
    out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \
    out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \
  }
#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
#define ILVEV_B2_SD(...) ILVEV_B2(v2i64, __VA_ARGS__)

/* Description : Interleave even halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'
*/
#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)  \
  {                                                      \
    out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \
    out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2); \
  }
#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)

/* Description : Interleave even word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even word elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'
*/
#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1)  \
  {                                                      \
    out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0); \
    out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2); \
  }
#define ILVEV_W2_SD(...) ILVEV_W2(v2i64, __VA_ARGS__)

/* Description : Interleave even double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double word elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'
*/
#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)  \
  {                                                      \
    out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0); \
    out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2); \
  }
#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)

/* Description : Interleave left half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of byte elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'.
*/
#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
  {                                                    \
    out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
    out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3); \
  }
#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)

#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                out2, out3)                                                \
  {                                                                        \
    ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)

/* Description : Interleave left half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of halfword elements of 'in0' and 'in1' are
                 interleaved and written to 'out0'.
*/
#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1)  \
  {                                                     \
    out0 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \
    out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3); \
  }
#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)

/* Description : Interleave left half of word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of word elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'.
*/
#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1)  \
  {                                                     \
    out0 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \
    out1 = (RTYPE)__msa_ilvl_w((v4i32)in2, (v4i32)in3); \
  }
#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)

/* Description : Interleave right half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of byte elements of 'in0' and 'in1' are interleaved
                 and written to out0.
*/
#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
  {                                                    \
    out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
    out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3); \
  }
#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
#define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__)

#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                out2, out3)                                                \
  {                                                                        \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
#define ILVR_B4_SH(...)
ILVR_B4(v8i16, __VA_ARGS__) 1020 #define ILVR_B4_SW(...) ILVR_B4(v4i32, __VA_ARGS__) 1021 1022 /* Description : Interleave right half of halfword elements from vectors 1023 Arguments : Inputs - in0, in1, in2, in3 1024 Outputs - out0, out1 1025 Return Type - as per RTYPE 1026 Details : Right half of halfword elements of 'in0' and 'in1' are 1027 interleaved and written to 'out0'. 1028 */ 1029 #define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1030 { \ 1031 out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ 1032 out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \ 1033 } 1034 #define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__) 1035 #define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__) 1036 1037 #define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ 1038 out2, out3) \ 1039 { \ 1040 ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ 1041 ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ 1042 } 1043 #define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__) 1044 #define ILVR_H4_SW(...) ILVR_H4(v4i32, __VA_ARGS__) 1045 1046 #define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1047 { \ 1048 out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \ 1049 out1 = (RTYPE)__msa_ilvr_w((v4i32)in2, (v4i32)in3); \ 1050 } 1051 #define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__) 1052 1053 /* Description : Interleave right half of double word elements from vectors 1054 Arguments : Inputs - in0, in1, in2, in3 1055 Outputs - out0, out1 1056 Return Type - as per RTYPE 1057 Details : Right half of double word elements of 'in0' and 'in1' are 1058 interleaved and written to 'out0'. 1059 */ 1060 #define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1061 { \ 1062 out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \ 1063 out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \ 1064 } 1065 #define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__) 1066 #define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__) 1067 #define ILVR_D2_SH(...) 
ILVR_D2(v8i16, __VA_ARGS__) 1068 1069 #define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ 1070 out2, out3) \ 1071 { \ 1072 ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ 1073 ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3); \ 1074 } 1075 #define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__) 1076 #define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__) 1077 1078 /* Description : Interleave both left and right half of input vectors 1079 Arguments : Inputs - in0, in1 1080 Outputs - out0, out1 1081 Return Type - as per RTYPE 1082 Details : Right half of byte elements from 'in0' and 'in1' are 1083 interleaved and written to 'out0' 1084 */ 1085 #define ILVRL_B2(RTYPE, in0, in1, out0, out1) \ 1086 { \ 1087 out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ 1088 out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ 1089 } 1090 #define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__) 1091 #define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__) 1092 #define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__) 1093 #define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__) 1094 1095 #define ILVRL_H2(RTYPE, in0, in1, out0, out1) \ 1096 { \ 1097 out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ 1098 out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \ 1099 } 1100 #define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__) 1101 #define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__) 1102 1103 #define ILVRL_W2(RTYPE, in0, in1, out0, out1) \ 1104 { \ 1105 out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \ 1106 out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ 1107 } 1108 #define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__) 1109 #define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__) 1110 #define ILVRL_W2_SW(...) 
ILVRL_W2(v4i32, __VA_ARGS__) 1111 1112 /* Description : Maximum values between signed elements of vector and 1113 5-bit signed immediate value are copied to the output vector 1114 Arguments : Inputs - in0, in1, in2, in3, max_val 1115 Outputs - in place operation 1116 Return Type - unsigned halfword 1117 Details : Maximum of signed halfword element values from 'in0' and 1118 'max_val' are written in place 1119 */ 1120 #define MAXI_SH2(RTYPE, in0, in1, max_val) \ 1121 { \ 1122 in0 = (RTYPE)__msa_maxi_s_h((v8i16)in0, (max_val)); \ 1123 in1 = (RTYPE)__msa_maxi_s_h((v8i16)in1, (max_val)); \ 1124 } 1125 #define MAXI_SH2_SH(...) MAXI_SH2(v8i16, __VA_ARGS__) 1126 1127 /* Description : Saturate the halfword element values to the max 1128 unsigned value of (sat_val + 1) bits 1129 The element data width remains unchanged 1130 Arguments : Inputs - in0, in1, sat_val 1131 Outputs - in place operation 1132 Return Type - as per RTYPE 1133 Details : Each unsigned halfword element from 'in0' is saturated to the 1134 value generated with (sat_val + 1) bit range. 1135 The results are written in place 1136 */ 1137 #define SAT_UH2(RTYPE, in0, in1, sat_val) \ 1138 { \ 1139 in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val); \ 1140 in1 = (RTYPE)__msa_sat_u_h((v8u16)in1, sat_val); \ 1141 } 1142 #define SAT_UH2_SH(...) SAT_UH2(v8i16, __VA_ARGS__) 1143 1144 /* Description : Saturate the halfword element values to the max 1145 unsigned value of (sat_val + 1) bits 1146 The element data width remains unchanged 1147 Arguments : Inputs - in0, in1, sat_val 1148 Outputs - in place operation 1149 Return Type - as per RTYPE 1150 Details : Each unsigned halfword element from 'in0' is saturated to the 1151 value generated with (sat_val + 1) bit range 1152 The results are written in place 1153 */ 1154 #define SAT_SH2(RTYPE, in0, in1, sat_val) \ 1155 { \ 1156 in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val); \ 1157 in1 = (RTYPE)__msa_sat_s_h((v8i16)in1, sat_val); \ 1158 } 1159 #define SAT_SH2_SH(...) 
SAT_SH2(v8i16, __VA_ARGS__) 1160 1161 #define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) \ 1162 { \ 1163 SAT_SH2(RTYPE, in0, in1, sat_val); \ 1164 SAT_SH2(RTYPE, in2, in3, sat_val); \ 1165 } 1166 #define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__) 1167 1168 /* Description : Indexed halfword element values are replicated to all 1169 elements in output vector 1170 Arguments : Inputs - in, idx0, idx1 1171 Outputs - out0, out1 1172 Return Type - as per RTYPE 1173 Details : 'idx0' element value from 'in' vector is replicated to all 1174 elements in 'out0' vector 1175 Valid index range for halfword operation is 0-7 1176 */ 1177 #define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) \ 1178 { \ 1179 out0 = (RTYPE)__msa_splati_h((v8i16)in, idx0); \ 1180 out1 = (RTYPE)__msa_splati_h((v8i16)in, idx1); \ 1181 } 1182 #define SPLATI_H2_SB(...) SPLATI_H2(v16i8, __VA_ARGS__) 1183 #define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__) 1184 1185 #define SPLATI_H3(RTYPE, in, idx0, idx1, idx2, out0, out1, out2) \ 1186 { \ 1187 SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \ 1188 out2 = (RTYPE)__msa_splati_h((v8i16)in, idx2); \ 1189 } 1190 #define SPLATI_H3_SB(...) SPLATI_H3(v16i8, __VA_ARGS__) 1191 #define SPLATI_H3_SH(...) SPLATI_H3(v8i16, __VA_ARGS__) 1192 1193 /* Description : Indexed word element values are replicated to all 1194 elements in output vector 1195 Arguments : Inputs - in, stidx 1196 Outputs - out0, out1 1197 Return Type - as per RTYPE 1198 Details : 'stidx' element value from 'in' vector is replicated to all 1199 elements in 'out0' vector 1200 'stidx + 1' element value from 'in' vector is replicated to all 1201 elements in 'out1' vector 1202 Valid index range for word operation is 0-3 1203 */ 1204 #define SPLATI_W2(RTYPE, in, stidx, out0, out1) \ 1205 { \ 1206 out0 = (RTYPE)__msa_splati_w((v4i32)in, stidx); \ 1207 out1 = (RTYPE)__msa_splati_w((v4i32)in, (stidx + 1)); \ 1208 } 1209 #define SPLATI_W2_SW(...) 
SPLATI_W2(v4i32, __VA_ARGS__) 1210 1211 /* Description : Pack even byte elements of vector pairs 1212 Arguments : Inputs - in0, in1, in2, in3 1213 Outputs - out0, out1 1214 Return Type - as per RTYPE 1215 Details : Even byte elements of 'in0' are copied to the left half of 1216 'out0' & even byte elements of 'in1' are copied to the right 1217 half of 'out0'. 1218 */ 1219 #define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1220 { \ 1221 out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1); \ 1222 out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3); \ 1223 } 1224 #define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__) 1225 #define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__) 1226 #define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__) 1227 #define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__) 1228 1229 #define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ 1230 out2, out3) \ 1231 { \ 1232 PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ 1233 PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ 1234 } 1235 #define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__) 1236 #define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__) 1237 #define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__) 1238 1239 /* Description : Pack even halfword elements of vector pairs 1240 Arguments : Inputs - in0, in1, in2, in3 1241 Outputs - out0, out1 1242 Return Type - as per RTYPE 1243 Details : Even halfword elements of 'in0' are copied to the left half of 1244 'out0' & even halfword elements of 'in1' are copied to the 1245 right half of 'out0'. 1246 */ 1247 #define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1248 { \ 1249 out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1); \ 1250 out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3); \ 1251 } 1252 #define PCKEV_H2_SH(...) 
PCKEV_H2(v8i16, __VA_ARGS__) 1253 1254 #define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ 1255 out2, out3) \ 1256 { \ 1257 PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ 1258 PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ 1259 } 1260 #define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__) 1261 1262 /* Description : Pack even double word elements of vector pairs 1263 Arguments : Inputs - in0, in1, in2, in3 1264 Outputs - out0, out1 1265 Return Type - as per RTYPE 1266 Details : Even double elements of 'in0' are copied to the left half of 1267 'out0' & even double elements of 'in1' are copied to the right 1268 half of 'out0'. 1269 */ 1270 #define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1271 { \ 1272 out0 = (RTYPE)__msa_pckev_d((v2i64)in0, (v2i64)in1); \ 1273 out1 = (RTYPE)__msa_pckev_d((v2i64)in2, (v2i64)in3); \ 1274 } 1275 #define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__) 1276 #define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__) 1277 1278 /* Description : Pack odd double word elements of vector pairs 1279 Arguments : Inputs - in0, in1, in2, in3 1280 Outputs - out0, out1 1281 Return Type - as per RTYPE 1282 Details : Odd double word elements of 'in0' are copied to the left half 1283 of 'out0' & odd double word elements of 'in1' are copied to 1284 the right half of 'out0'. 1285 */ 1286 #define PCKOD_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1287 { \ 1288 out0 = (RTYPE)__msa_pckod_d((v2i64)in0, (v2i64)in1); \ 1289 out1 = (RTYPE)__msa_pckod_d((v2i64)in2, (v2i64)in3); \ 1290 } 1291 #define PCKOD_D2_UB(...) PCKOD_D2(v16u8, __VA_ARGS__) 1292 #define PCKOD_D2_SH(...) PCKOD_D2(v8i16, __VA_ARGS__) 1293 1294 /* Description : Each byte element is logically xor'ed with immediate 128 1295 Arguments : Inputs - in0, in1 1296 Outputs - in place operation 1297 Return Type - as per RTYPE 1298 Details : Each unsigned byte element from input vector 'in0' is 1299 logically xor'ed with 128 and the result is stored in-place. 
1300 */ 1301 #define XORI_B2_128(RTYPE, in0, in1) \ 1302 { \ 1303 in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128); \ 1304 in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128); \ 1305 } 1306 #define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__) 1307 #define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__) 1308 1309 #define XORI_B3_128(RTYPE, in0, in1, in2) \ 1310 { \ 1311 XORI_B2_128(RTYPE, in0, in1); \ 1312 in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128); \ 1313 } 1314 #define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__) 1315 1316 #define XORI_B4_128(RTYPE, in0, in1, in2, in3) \ 1317 { \ 1318 XORI_B2_128(RTYPE, in0, in1); \ 1319 XORI_B2_128(RTYPE, in2, in3); \ 1320 } 1321 #define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__) 1322 #define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__) 1323 1324 #define XORI_B5_128(RTYPE, in0, in1, in2, in3, in4) \ 1325 { \ 1326 XORI_B3_128(RTYPE, in0, in1, in2); \ 1327 XORI_B2_128(RTYPE, in3, in4); \ 1328 } 1329 #define XORI_B5_128_SB(...) XORI_B5_128(v16i8, __VA_ARGS__) 1330 1331 #define XORI_B8_128(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7) \ 1332 { \ 1333 XORI_B4_128(RTYPE, in0, in1, in2, in3); \ 1334 XORI_B4_128(RTYPE, in4, in5, in6, in7); \ 1335 } 1336 #define XORI_B8_128_SB(...) XORI_B8_128(v16i8, __VA_ARGS__) 1337 1338 /* Description : Shift left all elements of vector (generic for all data types) 1339 Arguments : Inputs - in0, in1, in2, in3, shift 1340 Outputs - in place operation 1341 Return Type - as per input vector RTYPE 1342 Details : Each element of vector 'in0' is left shifted by 'shift' and 1343 the result is written in-place. 
1344 */ 1345 #define SLLI_4V(in0, in1, in2, in3, shift) \ 1346 { \ 1347 in0 = in0 << shift; \ 1348 in1 = in1 << shift; \ 1349 in2 = in2 << shift; \ 1350 in3 = in3 << shift; \ 1351 } 1352 1353 /* Description : Arithmetic shift right all elements of vector 1354 (generic for all data types) 1355 Arguments : Inputs - in0, in1, in2, in3, shift 1356 Outputs - in place operation 1357 Return Type - as per input vector RTYPE 1358 Details : Each element of vector 'in0' is right shifted by 'shift' and 1359 the result is written in-place. 'shift' is a GP variable. 1360 */ 1361 #define SRA_4V(in0, in1, in2, in3, shift) \ 1362 { \ 1363 in0 = in0 >> shift; \ 1364 in1 = in1 >> shift; \ 1365 in2 = in2 >> shift; \ 1366 in3 = in3 >> shift; \ 1367 } 1368 1369 /* Description : Shift right arithmetic rounded words 1370 Arguments : Inputs - in0, in1, shift 1371 Outputs - in place operation 1372 Return Type - as per RTYPE 1373 Details : Each element of vector 'in0' is shifted right arithmetically by 1374 the number of bits in the corresponding element in the vector 1375 'shift'. The last discarded bit is added to shifted value for 1376 rounding and the result is written in-place. 1377 'shift' is a vector. 1378 */ 1379 #define SRAR_W2(RTYPE, in0, in1, shift) \ 1380 { \ 1381 in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)shift); \ 1382 in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)shift); \ 1383 } 1384 1385 #define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) \ 1386 { \ 1387 SRAR_W2(RTYPE, in0, in1, shift); \ 1388 SRAR_W2(RTYPE, in2, in3, shift); \ 1389 } 1390 #define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__) 1391 1392 /* Description : Shift right arithmetic rounded (immediate) 1393 Arguments : Inputs - in0, in1, shift 1394 Outputs - in place operation 1395 Return Type - as per RTYPE 1396 Details : Each element of vector 'in0' is shifted right arithmetically by 1397 the value in 'shift'. The last discarded bit is added to the 1398 shifted value for rounding and the result is written in-place. 
1399 'shift' is an immediate value. 1400 */ 1401 #define SRARI_H2(RTYPE, in0, in1, shift) \ 1402 { \ 1403 in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift); \ 1404 in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift); \ 1405 } 1406 #define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__) 1407 #define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__) 1408 1409 #define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) \ 1410 { \ 1411 SRARI_H2(RTYPE, in0, in1, shift); \ 1412 SRARI_H2(RTYPE, in2, in3, shift); \ 1413 } 1414 #define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__) 1415 #define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__) 1416 1417 #define SRARI_W2(RTYPE, in0, in1, shift) \ 1418 { \ 1419 in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift); \ 1420 in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift); \ 1421 } 1422 1423 #define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) \ 1424 { \ 1425 SRARI_W2(RTYPE, in0, in1, shift); \ 1426 SRARI_W2(RTYPE, in2, in3, shift); \ 1427 } 1428 #define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__) 1429 1430 /* Description : Multiplication of pairs of vectors 1431 Arguments : Inputs - in0, in1, in2, in3 1432 Outputs - out0, out1 1433 Details : Each element from 'in0' is multiplied with elements from 'in1' 1434 and the result is written to 'out0' 1435 */ 1436 #define MUL2(in0, in1, in2, in3, out0, out1) \ 1437 { \ 1438 out0 = in0 * in1; \ 1439 out1 = in2 * in3; \ 1440 } 1441 #define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \ 1442 { \ 1443 MUL2(in0, in1, in2, in3, out0, out1); \ 1444 MUL2(in4, in5, in6, in7, out2, out3); \ 1445 } 1446 1447 /* Description : Addition of 2 pairs of vectors 1448 Arguments : Inputs - in0, in1, in2, in3 1449 Outputs - out0, out1 1450 Details : Each element in 'in0' is added to 'in1' and result is written 1451 to 'out0'. 
1452 */ 1453 #define ADD2(in0, in1, in2, in3, out0, out1) \ 1454 { \ 1455 out0 = in0 + in1; \ 1456 out1 = in2 + in3; \ 1457 } 1458 #define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \ 1459 { \ 1460 ADD2(in0, in1, in2, in3, out0, out1); \ 1461 ADD2(in4, in5, in6, in7, out2, out3); \ 1462 } 1463 1464 /* Description : Subtraction of 2 pairs of vectors 1465 Arguments : Inputs - in0, in1, in2, in3 1466 Outputs - out0, out1 1467 Details : Each element in 'in1' is subtracted from 'in0' and result is 1468 written to 'out0'. 1469 */ 1470 #define SUB2(in0, in1, in2, in3, out0, out1) \ 1471 { \ 1472 out0 = in0 - in1; \ 1473 out1 = in2 - in3; \ 1474 } 1475 #define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \ 1476 { \ 1477 out0 = in0 - in1; \ 1478 out1 = in2 - in3; \ 1479 out2 = in4 - in5; \ 1480 out3 = in6 - in7; \ 1481 } 1482 1483 /* Description : Sign extend halfword elements from right half of the vector 1484 Arguments : Input - in (halfword vector) 1485 Output - out (sign extended word vector) 1486 Return Type - signed word 1487 Details : Sign bit of halfword elements from input vector 'in' is 1488 extracted and interleaved with same vector 'in0' to generate 1489 4 word elements keeping sign intact 1490 */ 1491 #define UNPCK_R_SH_SW(in, out) \ 1492 { \ 1493 v8i16 sign_m; \ 1494 \ 1495 sign_m = __msa_clti_s_h((v8i16)in, 0); \ 1496 out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in); \ 1497 } 1498 1499 /* Description : Zero extend unsigned byte elements to halfword elements 1500 Arguments : Input - in (unsigned byte vector) 1501 Outputs - out0, out1 (unsigned halfword vectors) 1502 Return Type - signed halfword 1503 Details : Zero extended right half of vector is returned in 'out0' 1504 Zero extended left half of vector is returned in 'out1' 1505 */ 1506 #define UNPCK_UB_SH(in, out0, out1) \ 1507 { \ 1508 v16i8 zero_m = { 0 }; \ 1509 \ 1510 ILVRL_B2_SH(zero_m, in, out0, out1); \ 1511 } 1512 1513 /* Description : Sign extend 
halfword elements from input vector and return 1514 the result in pair of vectors 1515 Arguments : Input - in (halfword vector) 1516 Outputs - out0, out1 (sign extended word vectors) 1517 Return Type - signed word 1518 Details : Sign bit of halfword elements from input vector 'in' is 1519 extracted and interleaved right with same vector 'in0' to 1520 generate 4 signed word elements in 'out0' 1521 Then interleaved left with same vector 'in0' to 1522 generate 4 signed word elements in 'out1' 1523 */ 1524 #define UNPCK_SH_SW(in, out0, out1) \ 1525 { \ 1526 v8i16 tmp_m; \ 1527 \ 1528 tmp_m = __msa_clti_s_h((v8i16)in, 0); \ 1529 ILVRL_H2_SW(tmp_m, in, out0, out1); \ 1530 } 1531 1532 /* Description : Butterfly of 4 input vectors 1533 Arguments : Inputs - in0, in1, in2, in3 1534 Outputs - out0, out1, out2, out3 1535 Details : Butterfly operation 1536 */ 1537 #define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) \ 1538 { \ 1539 out0 = in0 + in3; \ 1540 out1 = in1 + in2; \ 1541 \ 1542 out2 = in1 - in2; \ 1543 out3 = in0 - in3; \ 1544 } 1545 1546 /* Description : Transpose input 8x8 byte block 1547 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 1548 Outputs - out0, out1, out2, out3, out4, out5, out6, out7 1549 Return Type - as per RTYPE 1550 */ 1551 #define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, \ 1552 out1, out2, out3, out4, out5, out6, out7) \ 1553 { \ 1554 v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ 1555 v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ 1556 \ 1557 ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, tmp0_m, tmp1_m, tmp2_m, \ 1558 tmp3_m); \ 1559 ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m); \ 1560 ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m); \ 1561 ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2); \ 1562 ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6); \ 1563 SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8); \ 1564 SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8); \ 1565 } 1566 #define TRANSPOSE8x8_UB_UB(...) 
TRANSPOSE8x8_UB(v16u8, __VA_ARGS__) 1567 1568 /* Description : Transpose 16x4 block into 4x16 with byte elements in vectors 1569 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, 1570 in8, in9, in10, in11, in12, in13, in14, in15 1571 Outputs - out0, out1, out2, out3 1572 Return Type - unsigned byte 1573 */ 1574 #define TRANSPOSE16x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, \ 1575 in10, in11, in12, in13, in14, in15, out0, out1, \ 1576 out2, out3) \ 1577 { \ 1578 v2i64 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ 1579 \ 1580 ILVEV_W2_SD(in0, in4, in8, in12, tmp0_m, tmp1_m); \ 1581 out1 = (v16u8)__msa_ilvev_d(tmp1_m, tmp0_m); \ 1582 \ 1583 ILVEV_W2_SD(in1, in5, in9, in13, tmp0_m, tmp1_m); \ 1584 out3 = (v16u8)__msa_ilvev_d(tmp1_m, tmp0_m); \ 1585 \ 1586 ILVEV_W2_SD(in2, in6, in10, in14, tmp0_m, tmp1_m); \ 1587 \ 1588 tmp2_m = __msa_ilvev_d(tmp1_m, tmp0_m); \ 1589 ILVEV_W2_SD(in3, in7, in11, in15, tmp0_m, tmp1_m); \ 1590 \ 1591 tmp3_m = __msa_ilvev_d(tmp1_m, tmp0_m); \ 1592 ILVEV_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m); \ 1593 out0 = (v16u8)__msa_ilvev_h((v8i16)tmp1_m, (v8i16)tmp0_m); \ 1594 out2 = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m); \ 1595 \ 1596 tmp0_m = (v2i64)__msa_ilvod_b((v16i8)out3, (v16i8)out1); \ 1597 tmp1_m = (v2i64)__msa_ilvod_b((v16i8)tmp3_m, (v16i8)tmp2_m); \ 1598 out1 = (v16u8)__msa_ilvev_h((v8i16)tmp1_m, (v8i16)tmp0_m); \ 1599 out3 = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m); \ 1600 } 1601 1602 /* Description : Transpose 16x8 block into 8x16 with byte elements in vectors 1603 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, 1604 in8, in9, in10, in11, in12, in13, in14, in15 1605 Outputs - out0, out1, out2, out3, out4, out5, out6, out7 1606 Return Type - unsigned byte 1607 */ 1608 #define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, \ 1609 in10, in11, in12, in13, in14, in15, out0, out1, \ 1610 out2, out3, out4, out5, out6, out7) \ 1611 { \ 1612 v16u8 tmp0_m, tmp1_m, 
tmp2_m, tmp3_m; \ 1613 v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ 1614 \ 1615 ILVEV_D2_UB(in0, in8, in1, in9, out7, out6); \ 1616 ILVEV_D2_UB(in2, in10, in3, in11, out5, out4); \ 1617 ILVEV_D2_UB(in4, in12, in5, in13, out3, out2); \ 1618 ILVEV_D2_UB(in6, in14, in7, in15, out1, out0); \ 1619 \ 1620 tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7); \ 1621 tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7); \ 1622 tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5); \ 1623 tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5); \ 1624 out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3); \ 1625 tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3); \ 1626 out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1); \ 1627 tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1); \ 1628 \ 1629 ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \ 1630 out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ 1631 out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ 1632 \ 1633 tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m); \ 1634 tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5); \ 1635 out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ 1636 out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ 1637 \ 1638 ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \ 1639 out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ 1640 out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ 1641 \ 1642 tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \ 1643 tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \ 1644 tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \ 1645 tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \ 1646 out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ 1647 out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ 1648 } 1649 1650 /* Description : Transpose 4x4 block with half word elements in vectors 1651 Arguments : 
Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed halfword
*/
#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
  {                                                                    \
    v8i16 s0_m, s1_m;                                                  \
                                                                       \
    ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m);                        \
    ILVRL_W2_SH(s1_m, s0_m, out0, out2);                               \
    out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0);              \
    out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2);              \
  }

/* Description : Transpose 8x4 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3 (each holding 8 halfwords)
                 Outputs - out0, out1, out2, out3
                 Return Type - signed halfword
*/
#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
  {                                                                    \
    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                              \
                                                                       \
    ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m);                    \
    ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m);                    \
    ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2);            \
    ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3);            \
  }

/* Description : Transpose 4x4 block with word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed word
*/
#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) \
  {                                                                    \
    v4i32 s0_m, s1_m, s2_m, s3_m;                                      \
                                                                       \
    ILVRL_W2_SW(in1, in0, s0_m, s1_m);                                 \
    ILVRL_W2_SW(in3, in2, s2_m, s3_m);                                 \
                                                                       \
    out0 = (v4i32)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m);              \
    out1 = (v4i32)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m);              \
    out2 = (v4i32)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m);              \
    out3 = (v4i32)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m);              \
  }

/* Description : Dot product and addition of 3 signed halfword input vectors
   Arguments   : Inputs - in0, in1, in2, coeff0, coeff1, coeff2
                 Output - out0_m
                 Return Type - signed halfword
   Details     : Dot product of 'in0' with 'coeff0'
                 Dot product of 'in1' with 'coeff1'
                 Dot product of 'in2' with 'coeff2'
                 Addition (saturating) of all the 3 vector results
                 out0_m = (in0 * coeff0) + (in1 * coeff1) + (in2 * coeff2)
*/
#define DPADD_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2)      \
  ({                                                             \
    v8i16 tmp1_m;                                                \
    v8i16 out0_m;                                                \
                                                                 \
    out0_m = __msa_dotp_s_h((v16i8)in0, (v16i8)coeff0);          \
    out0_m = __msa_dpadd_s_h(out0_m, (v16i8)in1, (v16i8)coeff1); \
    tmp1_m = __msa_dotp_s_h((v16i8)in2, (v16i8)coeff2);          \
    out0_m = __msa_adds_s_h(out0_m, tmp1_m);                     \
                                                                 \
    out0_m;                                                      \
  })

/* Description : Pack even elements of input vectors & xor with 128
   Arguments   : Inputs - in0, in1
                 Output - out_m
                 Return Type - unsigned byte
   Details     : Signed byte even elements from 'in0' and 'in1' are packed
                 together in one vector and the resulting vector is xor'ed
                 with 128 to shift the range from signed to unsigned byte
*/
#define PCKEV_XORI128_UB(in0, in1)                        \
  ({                                                      \
    v16u8 out_m;                                          \
    out_m = (v16u8)__msa_pckev_b((v16i8)in1, (v16i8)in0); \
    out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128);       \
    out_m;                                                \
  })

/* Description : Pack even byte elements and store byte vector in destination
                 memory
   Arguments   : Inputs - in0, in1, pdst (destination pointer)
*/
#define PCKEV_ST_SB(in0, in1, pdst)                \
  {                                                \
    v16i8 tmp_m;                                   \
    tmp_m = __msa_pckev_b((v16i8)in1, (v16i8)in0); \
    ST_SB(tmp_m, (pdst));                          \
  }

/* Description : Horizontal 2 tap filter kernel code
   Arguments   : Inputs - in0, in1, mask, coeff, shift
   Details     : Bytes selected by 'mask' from the 'in0'/'in1' pair are dot
                 multiplied with 'coeff' and the result is rounded right
                 shifted by 'shift'
*/
#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift)        \
  ({                                                            \
    v16i8 tmp0_m;                                               \
    v8u16 tmp1_m;                                               \
                                                                \
    tmp0_m = __msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0); \
    tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)coeff);       \
    tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift);        \
                                                                \
    tmp1_m;                                                     \
  })
#endif  // VPX_VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_