1 /* 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #ifndef VPX_DSP_MIPS_MACROS_MSA_H_ 12 #define VPX_DSP_MIPS_MACROS_MSA_H_ 13 14 #include <msa.h> 15 16 #include "./vpx_config.h" 17 #include "vpx/vpx_integer.h" 18 19 #define LD_B(RTYPE, psrc) *((const RTYPE *)(psrc)) 20 #define LD_UB(...) LD_B(v16u8, __VA_ARGS__) 21 #define LD_SB(...) LD_B(v16i8, __VA_ARGS__) 22 23 #define LD_H(RTYPE, psrc) *((const RTYPE *)(psrc)) 24 #define LD_UH(...) LD_H(v8u16, __VA_ARGS__) 25 #define LD_SH(...) LD_H(v8i16, __VA_ARGS__) 26 27 #define LD_W(RTYPE, psrc) *((const RTYPE *)(psrc)) 28 #define LD_SW(...) LD_W(v4i32, __VA_ARGS__) 29 30 #define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) 31 #define ST_UB(...) ST_B(v16u8, __VA_ARGS__) 32 #define ST_SB(...) ST_B(v16i8, __VA_ARGS__) 33 34 #define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) 35 #define ST_SH(...) ST_H(v8i16, __VA_ARGS__) 36 37 #define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) 38 #define ST_SW(...) 
ST_W(v4i32, __VA_ARGS__) 39 40 #if (__mips_isa_rev >= 6) 41 #define LH(psrc) ({ \ 42 const uint8_t *psrc_m = (const uint8_t *)(psrc); \ 43 uint16_t val_m; \ 44 \ 45 __asm__ __volatile__ ( \ 46 "lh %[val_m], %[psrc_m] \n\t" \ 47 \ 48 : [val_m] "=r" (val_m) \ 49 : [psrc_m] "m" (*psrc_m) \ 50 ); \ 51 \ 52 val_m; \ 53 }) 54 55 #define LW(psrc) ({ \ 56 const uint8_t *psrc_m = (const uint8_t *)(psrc); \ 57 uint32_t val_m; \ 58 \ 59 __asm__ __volatile__ ( \ 60 "lw %[val_m], %[psrc_m] \n\t" \ 61 \ 62 : [val_m] "=r" (val_m) \ 63 : [psrc_m] "m" (*psrc_m) \ 64 ); \ 65 \ 66 val_m; \ 67 }) 68 69 #if (__mips == 64) 70 #define LD(psrc) ({ \ 71 const uint8_t *psrc_m = (const uint8_t *)(psrc); \ 72 uint64_t val_m = 0; \ 73 \ 74 __asm__ __volatile__ ( \ 75 "ld %[val_m], %[psrc_m] \n\t" \ 76 \ 77 : [val_m] "=r" (val_m) \ 78 : [psrc_m] "m" (*psrc_m) \ 79 ); \ 80 \ 81 val_m; \ 82 }) 83 #else // !(__mips == 64) 84 #define LD(psrc) ({ \ 85 const uint8_t *psrc_m = (const uint8_t *)(psrc); \ 86 uint32_t val0_m, val1_m; \ 87 uint64_t val_m = 0; \ 88 \ 89 val0_m = LW(psrc_m); \ 90 val1_m = LW(psrc_m + 4); \ 91 \ 92 val_m = (uint64_t)(val1_m); \ 93 val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ 94 val_m = (uint64_t)(val_m | (uint64_t)val0_m); \ 95 \ 96 val_m; \ 97 }) 98 #endif // (__mips == 64) 99 100 #define SH(val, pdst) { \ 101 uint8_t *pdst_m = (uint8_t *)(pdst); \ 102 const uint16_t val_m = (val); \ 103 \ 104 __asm__ __volatile__ ( \ 105 "sh %[val_m], %[pdst_m] \n\t" \ 106 \ 107 : [pdst_m] "=m" (*pdst_m) \ 108 : [val_m] "r" (val_m) \ 109 ); \ 110 } 111 112 #define SW(val, pdst) { \ 113 uint8_t *pdst_m = (uint8_t *)(pdst); \ 114 const uint32_t val_m = (val); \ 115 \ 116 __asm__ __volatile__ ( \ 117 "sw %[val_m], %[pdst_m] \n\t" \ 118 \ 119 : [pdst_m] "=m" (*pdst_m) \ 120 : [val_m] "r" (val_m) \ 121 ); \ 122 } 123 124 #define SD(val, pdst) { \ 125 uint8_t *pdst_m = (uint8_t *)(pdst); \ 126 const uint64_t val_m = (val); \ 127 \ 128 __asm__ __volatile__ ( \ 129 "sd %[val_m], 
%[pdst_m] \n\t" \ 130 \ 131 : [pdst_m] "=m" (*pdst_m) \ 132 : [val_m] "r" (val_m) \ 133 ); \ 134 } 135 #else // !(__mips_isa_rev >= 6) 136 #define LH(psrc) ({ \ 137 const uint8_t *psrc_m = (const uint8_t *)(psrc); \ 138 uint16_t val_m; \ 139 \ 140 __asm__ __volatile__ ( \ 141 "ulh %[val_m], %[psrc_m] \n\t" \ 142 \ 143 : [val_m] "=r" (val_m) \ 144 : [psrc_m] "m" (*psrc_m) \ 145 ); \ 146 \ 147 val_m; \ 148 }) 149 150 #define LW(psrc) ({ \ 151 const uint8_t *psrc_m = (const uint8_t *)(psrc); \ 152 uint32_t val_m; \ 153 \ 154 __asm__ __volatile__ ( \ 155 "ulw %[val_m], %[psrc_m] \n\t" \ 156 \ 157 : [val_m] "=r" (val_m) \ 158 : [psrc_m] "m" (*psrc_m) \ 159 ); \ 160 \ 161 val_m; \ 162 }) 163 164 #if (__mips == 64) 165 #define LD(psrc) ({ \ 166 const uint8_t *psrc_m = (const uint8_t *)(psrc); \ 167 uint64_t val_m = 0; \ 168 \ 169 __asm__ __volatile__ ( \ 170 "uld %[val_m], %[psrc_m] \n\t" \ 171 \ 172 : [val_m] "=r" (val_m) \ 173 : [psrc_m] "m" (*psrc_m) \ 174 ); \ 175 \ 176 val_m; \ 177 }) 178 #else // !(__mips == 64) 179 #define LD(psrc) ({ \ 180 const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \ 181 uint32_t val0_m, val1_m; \ 182 uint64_t val_m = 0; \ 183 \ 184 val0_m = LW(psrc_m1); \ 185 val1_m = LW(psrc_m1 + 4); \ 186 \ 187 val_m = (uint64_t)(val1_m); \ 188 val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ 189 val_m = (uint64_t)(val_m | (uint64_t)val0_m); \ 190 \ 191 val_m; \ 192 }) 193 #endif // (__mips == 64) 194 195 #define SH(val, pdst) { \ 196 uint8_t *pdst_m = (uint8_t *)(pdst); \ 197 const uint16_t val_m = (val); \ 198 \ 199 __asm__ __volatile__ ( \ 200 "ush %[val_m], %[pdst_m] \n\t" \ 201 \ 202 : [pdst_m] "=m" (*pdst_m) \ 203 : [val_m] "r" (val_m) \ 204 ); \ 205 } 206 207 #define SW(val, pdst) { \ 208 uint8_t *pdst_m = (uint8_t *)(pdst); \ 209 const uint32_t val_m = (val); \ 210 \ 211 __asm__ __volatile__ ( \ 212 "usw %[val_m], %[pdst_m] \n\t" \ 213 \ 214 : [pdst_m] "=m" (*pdst_m) \ 215 : [val_m] "r" (val_m) \ 216 ); \ 217 } 218 219 #define SD(val, 
pdst) { \ 220 uint8_t *pdst_m1 = (uint8_t *)(pdst); \ 221 uint32_t val0_m, val1_m; \ 222 \ 223 val0_m = (uint32_t)((val) & 0x00000000FFFFFFFF); \ 224 val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ 225 \ 226 SW(val0_m, pdst_m1); \ 227 SW(val1_m, pdst_m1 + 4); \ 228 } 229 #endif // (__mips_isa_rev >= 6) 230 231 /* Description : Load 4 words with stride 232 Arguments : Inputs - psrc, stride 233 Outputs - out0, out1, out2, out3 234 Details : Load word in 'out0' from (psrc) 235 Load word in 'out1' from (psrc + stride) 236 Load word in 'out2' from (psrc + 2 * stride) 237 Load word in 'out3' from (psrc + 3 * stride) 238 */ 239 #define LW4(psrc, stride, out0, out1, out2, out3) { \ 240 out0 = LW((psrc)); \ 241 out1 = LW((psrc) + stride); \ 242 out2 = LW((psrc) + 2 * stride); \ 243 out3 = LW((psrc) + 3 * stride); \ 244 } 245 246 /* Description : Load double words with stride 247 Arguments : Inputs - psrc, stride 248 Outputs - out0, out1 249 Details : Load double word in 'out0' from (psrc) 250 Load double word in 'out1' from (psrc + stride) 251 */ 252 #define LD2(psrc, stride, out0, out1) { \ 253 out0 = LD((psrc)); \ 254 out1 = LD((psrc) + stride); \ 255 } 256 #define LD4(psrc, stride, out0, out1, out2, out3) { \ 257 LD2((psrc), stride, out0, out1); \ 258 LD2((psrc) + 2 * stride, stride, out2, out3); \ 259 } 260 261 /* Description : Store 4 words with stride 262 Arguments : Inputs - in0, in1, in2, in3, pdst, stride 263 Details : Store word from 'in0' to (pdst) 264 Store word from 'in1' to (pdst + stride) 265 Store word from 'in2' to (pdst + 2 * stride) 266 Store word from 'in3' to (pdst + 3 * stride) 267 */ 268 #define SW4(in0, in1, in2, in3, pdst, stride) { \ 269 SW(in0, (pdst)) \ 270 SW(in1, (pdst) + stride); \ 271 SW(in2, (pdst) + 2 * stride); \ 272 SW(in3, (pdst) + 3 * stride); \ 273 } 274 275 /* Description : Store 4 double words with stride 276 Arguments : Inputs - in0, in1, in2, in3, pdst, stride 277 Details : Store double word from 'in0' to (pdst) 278 
Store double word from 'in1' to (pdst + stride) 279 Store double word from 'in2' to (pdst + 2 * stride) 280 Store double word from 'in3' to (pdst + 3 * stride) 281 */ 282 #define SD4(in0, in1, in2, in3, pdst, stride) { \ 283 SD(in0, (pdst)) \ 284 SD(in1, (pdst) + stride); \ 285 SD(in2, (pdst) + 2 * stride); \ 286 SD(in3, (pdst) + 3 * stride); \ 287 } 288 289 /* Description : Load vectors with 16 byte elements with stride 290 Arguments : Inputs - psrc, stride 291 Outputs - out0, out1 292 Return Type - as per RTYPE 293 Details : Load 16 byte elements in 'out0' from (psrc) 294 Load 16 byte elements in 'out1' from (psrc + stride) 295 */ 296 #define LD_B2(RTYPE, psrc, stride, out0, out1) { \ 297 out0 = LD_B(RTYPE, (psrc)); \ 298 out1 = LD_B(RTYPE, (psrc) + stride); \ 299 } 300 #define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__) 301 #define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__) 302 303 #define LD_B3(RTYPE, psrc, stride, out0, out1, out2) { \ 304 LD_B2(RTYPE, (psrc), stride, out0, out1); \ 305 out2 = LD_B(RTYPE, (psrc) + 2 * stride); \ 306 } 307 #define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__) 308 309 #define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) { \ 310 LD_B2(RTYPE, (psrc), stride, out0, out1); \ 311 LD_B2(RTYPE, (psrc) + 2 * stride , stride, out2, out3); \ 312 } 313 #define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__) 314 #define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__) 315 316 #define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) { \ 317 LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ 318 out4 = LD_B(RTYPE, (psrc) + 4 * stride); \ 319 } 320 #define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__) 321 #define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__) 322 323 #define LD_B7(RTYPE, psrc, stride, \ 324 out0, out1, out2, out3, out4, out5, out6) { \ 325 LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \ 326 LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \ 327 } 328 #define LD_SB7(...) 
LD_B7(v16i8, __VA_ARGS__) 329 330 #define LD_B8(RTYPE, psrc, stride, \ 331 out0, out1, out2, out3, out4, out5, out6, out7) { \ 332 LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ 333 LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ 334 } 335 #define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__) 336 #define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__) 337 338 /* Description : Load vectors with 8 halfword elements with stride 339 Arguments : Inputs - psrc, stride 340 Outputs - out0, out1 341 Details : Load 8 halfword elements in 'out0' from (psrc) 342 Load 8 halfword elements in 'out1' from (psrc + stride) 343 */ 344 #define LD_H2(RTYPE, psrc, stride, out0, out1) { \ 345 out0 = LD_H(RTYPE, (psrc)); \ 346 out1 = LD_H(RTYPE, (psrc) + (stride)); \ 347 } 348 #define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__) 349 350 #define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) { \ 351 LD_H2(RTYPE, (psrc), stride, out0, out1); \ 352 LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ 353 } 354 #define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__) 355 356 #define LD_H8(RTYPE, psrc, stride, \ 357 out0, out1, out2, out3, out4, out5, out6, out7) { \ 358 LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ 359 LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ 360 } 361 #define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__) 362 363 #define LD_H16(RTYPE, psrc, stride, \ 364 out0, out1, out2, out3, out4, out5, out6, out7, \ 365 out8, out9, out10, out11, out12, out13, out14, out15) { \ 366 LD_H8(RTYPE, (psrc), stride, \ 367 out0, out1, out2, out3, out4, out5, out6, out7); \ 368 LD_H8(RTYPE, (psrc) + 8 * stride, stride, \ 369 out8, out9, out10, out11, out12, out13, out14, out15); \ 370 } 371 #define LD_SH16(...) 
LD_H16(v8i16, __VA_ARGS__) 372 373 /* Description : Load 4x4 block of signed halfword elements from 1D source 374 data into 4 vectors (Each vector with 4 signed halfwords) 375 Arguments : Input - psrc 376 Outputs - out0, out1, out2, out3 377 */ 378 #define LD4x4_SH(psrc, out0, out1, out2, out3) { \ 379 out0 = LD_SH(psrc); \ 380 out2 = LD_SH(psrc + 8); \ 381 out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ 382 out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \ 383 } 384 385 /* Description : Load 2 vectors of signed word elements with stride 386 Arguments : Inputs - psrc, stride 387 Outputs - out0, out1 388 Return Type - signed word 389 */ 390 #define LD_SW2(psrc, stride, out0, out1) { \ 391 out0 = LD_SW((psrc)); \ 392 out1 = LD_SW((psrc) + stride); \ 393 } 394 395 /* Description : Store vectors of 16 byte elements with stride 396 Arguments : Inputs - in0, in1, pdst, stride 397 Details : Store 16 byte elements from 'in0' to (pdst) 398 Store 16 byte elements from 'in1' to (pdst + stride) 399 */ 400 #define ST_B2(RTYPE, in0, in1, pdst, stride) { \ 401 ST_B(RTYPE, in0, (pdst)); \ 402 ST_B(RTYPE, in1, (pdst) + stride); \ 403 } 404 #define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__) 405 406 #define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) { \ 407 ST_B2(RTYPE, in0, in1, (pdst), stride); \ 408 ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ 409 } 410 #define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__) 411 412 #define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 413 pdst, stride) { \ 414 ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride); \ 415 ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ 416 } 417 #define ST_UB8(...) 
ST_B8(v16u8, __VA_ARGS__) 418 419 /* Description : Store vectors of 8 halfword elements with stride 420 Arguments : Inputs - in0, in1, pdst, stride 421 Details : Store 8 halfword elements from 'in0' to (pdst) 422 Store 8 halfword elements from 'in1' to (pdst + stride) 423 */ 424 #define ST_H2(RTYPE, in0, in1, pdst, stride) { \ 425 ST_H(RTYPE, in0, (pdst)); \ 426 ST_H(RTYPE, in1, (pdst) + stride); \ 427 } 428 #define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__) 429 430 #define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride) { \ 431 ST_H2(RTYPE, in0, in1, (pdst), stride); \ 432 ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ 433 } 434 #define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__) 435 436 #define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) { \ 437 ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride); \ 438 ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ 439 } 440 #define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__) 441 442 /* Description : Store vectors of word elements with stride 443 Arguments : Inputs - in0, in1, pdst, stride 444 Details : Store 4 word elements from 'in0' to (pdst) 445 Store 4 word elements from 'in1' to (pdst + stride) 446 */ 447 #define ST_SW2(in0, in1, pdst, stride) { \ 448 ST_SW(in0, (pdst)); \ 449 ST_SW(in1, (pdst) + stride); \ 450 } 451 452 /* Description : Store 2x4 byte block to destination memory from input vector 453 Arguments : Inputs - in, stidx, pdst, stride 454 Details : Index 'stidx' halfword element from 'in' vector is copied to 455 the GP register and stored to (pdst) 456 Index 'stidx+1' halfword element from 'in' vector is copied to 457 the GP register and stored to (pdst + stride) 458 Index 'stidx+2' halfword element from 'in' vector is copied to 459 the GP register and stored to (pdst + 2 * stride) 460 Index 'stidx+3' halfword element from 'in' vector is copied to 461 the GP register and stored to (pdst + 3 * stride) 462 */ 463 #define ST2x4_UB(in, stidx, pdst, stride) { \ 464 uint16_t out0_m, out1_m, 
out2_m, out3_m; \ 465 uint8_t *pblk_2x4_m = (uint8_t *)(pdst); \ 466 \ 467 out0_m = __msa_copy_u_h((v8i16)in, (stidx)); \ 468 out1_m = __msa_copy_u_h((v8i16)in, (stidx + 1)); \ 469 out2_m = __msa_copy_u_h((v8i16)in, (stidx + 2)); \ 470 out3_m = __msa_copy_u_h((v8i16)in, (stidx + 3)); \ 471 \ 472 SH(out0_m, pblk_2x4_m); \ 473 SH(out1_m, pblk_2x4_m + stride); \ 474 SH(out2_m, pblk_2x4_m + 2 * stride); \ 475 SH(out3_m, pblk_2x4_m + 3 * stride); \ 476 } 477 478 /* Description : Store 4x2 byte block to destination memory from input vector 479 Arguments : Inputs - in, pdst, stride 480 Details : Index 0 word element from 'in' vector is copied to the GP 481 register and stored to (pdst) 482 Index 1 word element from 'in' vector is copied to the GP 483 register and stored to (pdst + stride) 484 */ 485 #define ST4x2_UB(in, pdst, stride) { \ 486 uint32_t out0_m, out1_m; \ 487 uint8_t *pblk_4x2_m = (uint8_t *)(pdst); \ 488 \ 489 out0_m = __msa_copy_u_w((v4i32)in, 0); \ 490 out1_m = __msa_copy_u_w((v4i32)in, 1); \ 491 \ 492 SW(out0_m, pblk_4x2_m); \ 493 SW(out1_m, pblk_4x2_m + stride); \ 494 } 495 496 /* Description : Store 4x4 byte block to destination memory from input vector 497 Arguments : Inputs - in0, in1, pdst, stride 498 Details : 'Idx0' word element from input vector 'in0' is copied to the 499 GP register and stored to (pdst) 500 'Idx1' word element from input vector 'in0' is copied to the 501 GP register and stored to (pdst + stride) 502 'Idx2' word element from input vector 'in0' is copied to the 503 GP register and stored to (pdst + 2 * stride) 504 'Idx3' word element from input vector 'in0' is copied to the 505 GP register and stored to (pdst + 3 * stride) 506 */ 507 #define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) { \ 508 uint32_t out0_m, out1_m, out2_m, out3_m; \ 509 uint8_t *pblk_4x4_m = (uint8_t *)(pdst); \ 510 \ 511 out0_m = __msa_copy_u_w((v4i32)in0, idx0); \ 512 out1_m = __msa_copy_u_w((v4i32)in0, idx1); \ 513 out2_m = 
__msa_copy_u_w((v4i32)in1, idx2); \ 514 out3_m = __msa_copy_u_w((v4i32)in1, idx3); \ 515 \ 516 SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride); \ 517 } 518 #define ST4x8_UB(in0, in1, pdst, stride) { \ 519 uint8_t *pblk_4x8 = (uint8_t *)(pdst); \ 520 \ 521 ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride); \ 522 ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride); \ 523 } 524 525 /* Description : Store 8x1 byte block to destination memory from input vector 526 Arguments : Inputs - in, pdst 527 Details : Index 0 double word element from 'in' vector is copied to the 528 GP register and stored to (pdst) 529 */ 530 #define ST8x1_UB(in, pdst) { \ 531 uint64_t out0_m; \ 532 \ 533 out0_m = __msa_copy_u_d((v2i64)in, 0); \ 534 SD(out0_m, pdst); \ 535 } 536 537 /* Description : Store 8x2 byte block to destination memory from input vector 538 Arguments : Inputs - in, pdst, stride 539 Details : Index 0 double word element from 'in' vector is copied to the 540 GP register and stored to (pdst) 541 Index 1 double word element from 'in' vector is copied to the 542 GP register and stored to (pdst + stride) 543 */ 544 #define ST8x2_UB(in, pdst, stride) { \ 545 uint64_t out0_m, out1_m; \ 546 uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \ 547 \ 548 out0_m = __msa_copy_u_d((v2i64)in, 0); \ 549 out1_m = __msa_copy_u_d((v2i64)in, 1); \ 550 \ 551 SD(out0_m, pblk_8x2_m); \ 552 SD(out1_m, pblk_8x2_m + stride); \ 553 } 554 555 /* Description : Store 8x4 byte block to destination memory from input 556 vectors 557 Arguments : Inputs - in0, in1, pdst, stride 558 Details : Index 0 double word element from 'in0' vector is copied to the 559 GP register and stored to (pdst) 560 Index 1 double word element from 'in0' vector is copied to the 561 GP register and stored to (pdst + stride) 562 Index 0 double word element from 'in1' vector is copied to the 563 GP register and stored to (pdst + 2 * stride) 564 Index 1 double word element from 'in1' vector is copied to the 565 GP register and 
stored to (pdst + 3 * stride) 566 */ 567 #define ST8x4_UB(in0, in1, pdst, stride) { \ 568 uint64_t out0_m, out1_m, out2_m, out3_m; \ 569 uint8_t *pblk_8x4_m = (uint8_t *)(pdst); \ 570 \ 571 out0_m = __msa_copy_u_d((v2i64)in0, 0); \ 572 out1_m = __msa_copy_u_d((v2i64)in0, 1); \ 573 out2_m = __msa_copy_u_d((v2i64)in1, 0); \ 574 out3_m = __msa_copy_u_d((v2i64)in1, 1); \ 575 \ 576 SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \ 577 } 578 579 /* Description : average with rounding (in0 + in1 + 1) / 2. 580 Arguments : Inputs - in0, in1, in2, in3, 581 Outputs - out0, out1 582 Return Type - as per RTYPE 583 Details : Each unsigned byte element from 'in0' vector is added with 584 each unsigned byte element from 'in1' vector. Then the average 585 with rounding is calculated and written to 'out0' 586 */ 587 #define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) { \ 588 out0 = (RTYPE)__msa_aver_u_b((v16u8)in0, (v16u8)in1); \ 589 out1 = (RTYPE)__msa_aver_u_b((v16u8)in2, (v16u8)in3); \ 590 } 591 #define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__) 592 593 #define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 594 out0, out1, out2, out3) { \ 595 AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \ 596 AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3) \ 597 } 598 #define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__) 599 600 /* Description : Immediate number of elements to slide with zero 601 Arguments : Inputs - in0, in1, slide_val 602 Outputs - out0, out1 603 Return Type - as per RTYPE 604 Details : Byte elements from 'zero_m' vector are slid into 'in0' by 605 value specified in the 'slide_val' 606 */ 607 #define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) { \ 608 v16i8 zero_m = { 0 }; \ 609 out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in0, slide_val); \ 610 out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in1, slide_val); \ 611 } 612 #define SLDI_B2_0_SW(...) 
SLDI_B2_0(v4i32, __VA_ARGS__) 613 614 #define SLDI_B4_0(RTYPE, in0, in1, in2, in3, \ 615 out0, out1, out2, out3, slide_val) { \ 616 SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val); \ 617 SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val); \ 618 } 619 #define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__) 620 621 /* Description : Immediate number of elements to slide 622 Arguments : Inputs - in0_0, in0_1, in1_0, in1_1, slide_val 623 Outputs - out0, out1 624 Return Type - as per RTYPE 625 Details : Byte elements from 'in0_0' vector are slid into 'in1_0' by 626 value specified in the 'slide_val' 627 */ 628 #define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) { \ 629 out0 = (RTYPE)__msa_sldi_b((v16i8)in0_0, (v16i8)in1_0, slide_val); \ 630 out1 = (RTYPE)__msa_sldi_b((v16i8)in0_1, (v16i8)in1_1, slide_val); \ 631 } 632 #define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__) 633 #define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__) 634 635 #define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, \ 636 out0, out1, out2, slide_val) { \ 637 SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \ 638 out2 = (RTYPE)__msa_sldi_b((v16i8)in0_2, (v16i8)in1_2, slide_val); \ 639 } 640 #define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__) 641 #define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__) 642 643 /* Description : Shuffle byte vector elements as per mask vector 644 Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 645 Outputs - out0, out1 646 Return Type - as per RTYPE 647 Details : Byte elements from 'in0' & 'in1' are copied selectively to 648 'out0' as per control vector 'mask0' 649 */ 650 #define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) { \ 651 out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \ 652 out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \ 653 } 654 #define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__) 655 #define VSHF_B2_SB(...) 
VSHF_B2(v16i8, __VA_ARGS__) 656 #define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__) 657 658 #define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, \ 659 out0, out1, out2, out3) { \ 660 VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1); \ 661 VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3); \ 662 } 663 #define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__) 664 #define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__) 665 666 /* Description : Dot product of byte vector elements 667 Arguments : Inputs - mult0, mult1, cnst0, cnst1 668 Outputs - out0, out1 669 Return Type - as per RTYPE 670 Details : Unsigned byte elements from 'mult0' are multiplied with 671 unsigned byte elements from 'cnst0' producing a result 672 twice the size of input i.e. unsigned halfword. 673 The multiplication result of adjacent odd-even elements 674 are added together and written to the 'out0' vector 675 */ 676 #define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \ 677 out0 = (RTYPE)__msa_dotp_u_h((v16u8)mult0, (v16u8)cnst0); \ 678 out1 = (RTYPE)__msa_dotp_u_h((v16u8)mult1, (v16u8)cnst1); \ 679 } 680 #define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__) 681 682 #define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, \ 683 cnst0, cnst1, cnst2, cnst3, \ 684 out0, out1, out2, out3) { \ 685 DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ 686 DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ 687 } 688 #define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__) 689 690 /* Description : Dot product of byte vector elements 691 Arguments : Inputs - mult0, mult1, cnst0, cnst1 692 Outputs - out0, out1 693 Return Type - as per RTYPE 694 Details : Signed byte elements from 'mult0' are multiplied with 695 signed byte elements from 'cnst0' producing a result 696 twice the size of input i.e. signed halfword. 
697 The multiplication result of adjacent odd-even elements 698 are added together and written to the 'out0' vector 699 */ 700 #define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \ 701 out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0); \ 702 out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1); \ 703 } 704 #define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__) 705 706 #define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, \ 707 cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) { \ 708 DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ 709 DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ 710 } 711 #define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__) 712 713 /* Description : Dot product of halfword vector elements 714 Arguments : Inputs - mult0, mult1, cnst0, cnst1 715 Outputs - out0, out1 716 Return Type - as per RTYPE 717 Details : Signed halfword elements from 'mult0' are multiplied with 718 signed halfword elements from 'cnst0' producing a result 719 twice the size of input i.e. signed word. 720 The multiplication result of adjacent odd-even elements 721 are added together and written to the 'out0' vector 722 */ 723 #define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \ 724 out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0); \ 725 out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1); \ 726 } 727 #define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__) 728 729 #define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, \ 730 cnst0, cnst1, cnst2, cnst3, \ 731 out0, out1, out2, out3) { \ 732 DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ 733 DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ 734 } 735 #define DOTP_SH4_SW(...) 
DOTP_SH4(v4i32, __VA_ARGS__) 736 737 /* Description : Dot product of word vector elements 738 Arguments : Inputs - mult0, mult1, cnst0, cnst1 739 Outputs - out0, out1 740 Return Type - as per RTYPE 741 Details : Signed word elements from 'mult0' are multiplied with 742 signed word elements from 'cnst0' producing a result 743 twice the size of input i.e. signed double word. 744 The multiplication result of adjacent odd-even elements 745 are added together and written to the 'out0' vector 746 */ 747 #define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \ 748 out0 = (RTYPE)__msa_dotp_s_d((v4i32)mult0, (v4i32)cnst0); \ 749 out1 = (RTYPE)__msa_dotp_s_d((v4i32)mult1, (v4i32)cnst1); \ 750 } 751 #define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__) 752 753 /* Description : Dot product & addition of byte vector elements 754 Arguments : Inputs - mult0, mult1, cnst0, cnst1 755 Outputs - out0, out1 756 Return Type - as per RTYPE 757 Details : Signed byte elements from 'mult0' are multiplied with 758 signed byte elements from 'cnst0' producing a result 759 twice the size of input i.e. signed halfword. 760 The multiplication result of adjacent odd-even elements 761 are added to the 'out0' vector 762 */ 763 #define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \ 764 out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0, (v16i8)mult0, (v16i8)cnst0); \ 765 out1 = (RTYPE)__msa_dpadd_s_h((v8i16)out1, (v16i8)mult1, (v16i8)cnst1); \ 766 } 767 #define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__) 768 769 #define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, \ 770 cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) { \ 771 DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ 772 DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ 773 } 774 #define DPADD_SB4_SH(...) 
DPADD_SB4(v8i16, __VA_ARGS__) 775 776 /* Description : Dot product & addition of halfword vector elements 777 Arguments : Inputs - mult0, mult1, cnst0, cnst1 778 Outputs - out0, out1 779 Return Type - as per RTYPE 780 Details : Signed halfword elements from 'mult0' are multiplied with 781 signed halfword elements from 'cnst0' producing a result 782 twice the size of input i.e. signed word. 783 The multiplication result of adjacent odd-even elements 784 are added to the 'out0' vector 785 */ 786 #define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \ 787 out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)mult0, (v8i16)cnst0); \ 788 out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)mult1, (v8i16)cnst1); \ 789 } 790 #define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__) 791 792 /* Description : Dot product & addition of double word vector elements 793 Arguments : Inputs - mult0, mult1 794 Outputs - out0, out1 795 Return Type - as per RTYPE 796 Details : Each signed word element from 'mult0' is multiplied with itself 797 producing an intermediate result twice the size of input 798 i.e. signed double word 799 The multiplication result of adjacent odd-even elements 800 are added to the 'out0' vector 801 */ 802 #define DPADD_SD2(RTYPE, mult0, mult1, out0, out1) { \ 803 out0 = (RTYPE)__msa_dpadd_s_d((v2i64)out0, (v4i32)mult0, (v4i32)mult0); \ 804 out1 = (RTYPE)__msa_dpadd_s_d((v2i64)out1, (v4i32)mult1, (v4i32)mult1); \ 805 } 806 #define DPADD_SD2_SD(...) 
DPADD_SD2(v2i64, __VA_ARGS__) 807 808 /* Description : Minimum values between unsigned elements of 809 either vector are copied to the output vector 810 Arguments : Inputs - in0, in1, min_vec 811 Outputs - in place operation 812 Return Type - as per RTYPE 813 Details : Minimum of unsigned halfword element values from 'in0' and 814 'min_vec' are written to output vector 'in0' 815 */ 816 #define MIN_UH2(RTYPE, in0, in1, min_vec) { \ 817 in0 = (RTYPE)__msa_min_u_h((v8u16)in0, min_vec); \ 818 in1 = (RTYPE)__msa_min_u_h((v8u16)in1, min_vec); \ 819 } 820 #define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__) 821 822 #define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec) { \ 823 MIN_UH2(RTYPE, in0, in1, min_vec); \ 824 MIN_UH2(RTYPE, in2, in3, min_vec); \ 825 } 826 #define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__) 827 828 /* Description : Clips all signed halfword elements of input vector 829 between 0 & 255 830 Arguments : Input - in 831 Output - out_m 832 Return Type - signed halfword 833 */ 834 #define CLIP_SH_0_255(in) ({ \ 835 v8i16 max_m = __msa_ldi_h(255); \ 836 v8i16 out_m; \ 837 \ 838 out_m = __msa_maxi_s_h((v8i16)in, 0); \ 839 out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \ 840 out_m; \ 841 }) 842 #define CLIP_SH2_0_255(in0, in1) { \ 843 in0 = CLIP_SH_0_255(in0); \ 844 in1 = CLIP_SH_0_255(in1); \ 845 } 846 #define CLIP_SH4_0_255(in0, in1, in2, in3) { \ 847 CLIP_SH2_0_255(in0, in1); \ 848 CLIP_SH2_0_255(in2, in3); \ 849 } 850 851 /* Description : Horizontal addition of 4 signed word elements of input vector 852 Arguments : Input - in (signed word vector) 853 Output - sum_m (i32 sum) 854 Return Type - signed word (GP) 855 Details : 4 signed word elements of 'in' vector are added together and 856 the resulting integer sum is returned 857 */ 858 #define HADD_SW_S32(in) ({ \ 859 v2i64 res0_m, res1_m; \ 860 int32_t sum_m; \ 861 \ 862 res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in); \ 863 res1_m = __msa_splati_d(res0_m, 1); \ 864 res0_m = res0_m + res1_m; \ 865 sum_m = 
__msa_copy_s_w((v4i32)res0_m, 0); \ 866 sum_m; \ 867 }) 868 869 /* Description : Horizontal addition of 8 unsigned halfword elements 870 Arguments : Inputs - in (unsigned halfword vector) 871 Outputs - sum_m (u32 sum) 872 Return Type - unsigned word 873 Details : 8 unsigned halfword elements of input vector are added 874 together and the resulting integer sum is returned 875 */ 876 #define HADD_UH_U32(in) ({ \ 877 v4u32 res_m; \ 878 v2u64 res0_m, res1_m; \ 879 uint32_t sum_m; \ 880 \ 881 res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \ 882 res0_m = __msa_hadd_u_d(res_m, res_m); \ 883 res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1); \ 884 res0_m = res0_m + res1_m; \ 885 sum_m = __msa_copy_u_w((v4i32)res0_m, 0); \ 886 sum_m; \ 887 }) 888 889 /* Description : Horizontal addition of unsigned byte vector elements 890 Arguments : Inputs - in0, in1 891 Outputs - out0, out1 892 Return Type - as per RTYPE 893 Details : Each unsigned odd byte element from 'in0' is added to 894 even unsigned byte element from 'in0' (pairwise) and the 895 halfword result is written to 'out0' 896 */ 897 #define HADD_UB2(RTYPE, in0, in1, out0, out1) { \ 898 out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0); \ 899 out1 = (RTYPE)__msa_hadd_u_h((v16u8)in1, (v16u8)in1); \ 900 } 901 #define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__) 902 903 #define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) { \ 904 HADD_UB2(RTYPE, in0, in1, out0, out1); \ 905 HADD_UB2(RTYPE, in2, in3, out2, out3); \ 906 } 907 #define HADD_UB4_UH(...) 
HADD_UB4(v8u16, __VA_ARGS__) 908 909 /* Description : Horizontal subtraction of unsigned byte vector elements 910 Arguments : Inputs - in0, in1 911 Outputs - out0, out1 912 Return Type - as per RTYPE 913 Details : Each unsigned odd byte element from 'in0' is subtracted from 914 even unsigned byte element from 'in0' (pairwise) and the 915 halfword result is written to 'out0' 916 */ 917 #define HSUB_UB2(RTYPE, in0, in1, out0, out1) { \ 918 out0 = (RTYPE)__msa_hsub_u_h((v16u8)in0, (v16u8)in0); \ 919 out1 = (RTYPE)__msa_hsub_u_h((v16u8)in1, (v16u8)in1); \ 920 } 921 #define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__) 922 923 /* Description : SAD (Sum of Absolute Difference) 924 Arguments : Inputs - in0, in1, ref0, ref1 925 Outputs - sad_m (halfword vector) 926 Return Type - unsigned halfword 927 Details : Absolute difference of all the byte elements from 'in0' with 928 'ref0' is calculated and preserved in 'diff0'. Then even-odd 929 pairs are added together to generate 8 halfword results. 930 */ 931 #define SAD_UB2_UH(in0, in1, ref0, ref1) ({ \ 932 v16u8 diff0_m, diff1_m; \ 933 v8u16 sad_m = { 0 }; \ 934 \ 935 diff0_m = __msa_asub_u_b((v16u8)in0, (v16u8)ref0); \ 936 diff1_m = __msa_asub_u_b((v16u8)in1, (v16u8)ref1); \ 937 \ 938 sad_m += __msa_hadd_u_h((v16u8)diff0_m, (v16u8)diff0_m); \ 939 sad_m += __msa_hadd_u_h((v16u8)diff1_m, (v16u8)diff1_m); \ 940 \ 941 sad_m; \ 942 }) 943 944 /* Description : Horizontal subtraction of signed halfword vector elements 945 Arguments : Inputs - in0, in1 946 Outputs - out0, out1 947 Return Type - as per RTYPE 948 Details : Each signed odd halfword element from 'in0' is subtracted from 949 even signed halfword element from 'in0' (pairwise) and the 950 word result is written to 'out0' 951 */ 952 #define HSUB_UH2(RTYPE, in0, in1, out0, out1) { \ 953 out0 = (RTYPE)__msa_hsub_s_w((v8i16)in0, (v8i16)in0); \ 954 out1 = (RTYPE)__msa_hsub_s_w((v8i16)in1, (v8i16)in1); \ 955 } 956 #define HSUB_UH2_SW(...) 
HSUB_UH2(v4i32, __VA_ARGS__)

/* Description : Insert GPR values into elements of a vector
   Arguments   : Inputs  - in0, in1 (and in2, in3 for the 4-element variant)
                 Output  - out (updated in place)
                 Return Type - as per RTYPE
   Details     : Word element 0 of 'out' is set to the GPR value 'in0',
                 element 1 to 'in1' (W4 also sets elements 2 and 3; D2 sets
                 the two doubleword elements). Elements not written keep
                 their previous contents.
*/
#define INSERT_W2(RTYPE, in0, in1, out) { \
  out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \
  out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \
}
#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)

#define INSERT_W4(RTYPE, in0, in1, in2, in3, out) { \
  out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \
  out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \
  out = (RTYPE)__msa_insert_w((v4i32)out, 2, in2); \
  out = (RTYPE)__msa_insert_w((v4i32)out, 3, in3); \
}
#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)

/* Doubleword variant: element 0 <- in0, element 1 <- in1 */
#define INSERT_D2(RTYPE, in0, in1, out) { \
  out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0); \
  out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1); \
}
#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)

/* Description : Interleave even byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'
*/
#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \
  out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \
  out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \
}
#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
#define ILVEV_B2_SH(...)
ILVEV_B2(v8i16, __VA_ARGS__) 999 1000 /* Description : Interleave even halfword elements from vectors 1001 Arguments : Inputs - in0, in1, in2, in3 1002 Outputs - out0, out1 1003 Return Type - as per RTYPE 1004 Details : Even halfword elements of 'in0' and 'in1' are interleaved 1005 and written to 'out0' 1006 */ 1007 #define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \ 1008 out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \ 1009 out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2); \ 1010 } 1011 #define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__) 1012 #define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__) 1013 #define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__) 1014 1015 /* Description : Interleave even word elements from vectors 1016 Arguments : Inputs - in0, in1, in2, in3 1017 Outputs - out0, out1 1018 Return Type - as per RTYPE 1019 Details : Even word elements of 'in0' and 'in1' are interleaved 1020 and written to 'out0' 1021 */ 1022 #define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) { \ 1023 out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0); \ 1024 out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2); \ 1025 } 1026 #define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__) 1027 1028 /* Description : Interleave even double word elements from vectors 1029 Arguments : Inputs - in0, in1, in2, in3 1030 Outputs - out0, out1 1031 Return Type - as per RTYPE 1032 Details : Even double word elements of 'in0' and 'in1' are interleaved 1033 and written to 'out0' 1034 */ 1035 #define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) { \ 1036 out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0); \ 1037 out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2); \ 1038 } 1039 #define ILVEV_D2_UB(...) 
ILVEV_D2(v16u8, __VA_ARGS__) 1040 1041 /* Description : Interleave left half of byte elements from vectors 1042 Arguments : Inputs - in0, in1, in2, in3 1043 Outputs - out0, out1 1044 Return Type - as per RTYPE 1045 Details : Left half of byte elements of 'in0' and 'in1' are interleaved 1046 and written to 'out0'. 1047 */ 1048 #define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \ 1049 out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ 1050 out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3); \ 1051 } 1052 #define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__) 1053 #define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__) 1054 #define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__) 1055 #define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__) 1056 1057 #define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 1058 out0, out1, out2, out3) { \ 1059 ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ 1060 ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ 1061 } 1062 #define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__) 1063 #define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__) 1064 1065 /* Description : Interleave left half of halfword elements from vectors 1066 Arguments : Inputs - in0, in1, in2, in3 1067 Outputs - out0, out1 1068 Return Type - as per RTYPE 1069 Details : Left half of halfword elements of 'in0' and 'in1' are 1070 interleaved and written to 'out0'. 1071 */ 1072 #define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \ 1073 out0 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \ 1074 out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3); \ 1075 } 1076 #define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__) 1077 1078 /* Description : Interleave left half of word elements from vectors 1079 Arguments : Inputs - in0, in1, in2, in3 1080 Outputs - out0, out1 1081 Return Type - as per RTYPE 1082 Details : Left half of word elements of 'in0' and 'in1' are interleaved 1083 and written to 'out0'. 
1084 */ 1085 #define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) { \ 1086 out0 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ 1087 out1 = (RTYPE)__msa_ilvl_w((v4i32)in2, (v4i32)in3); \ 1088 } 1089 #define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__) 1090 #define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__) 1091 1092 /* Description : Interleave right half of byte elements from vectors 1093 Arguments : Inputs - in0, in1, in2, in3 1094 Outputs - out0, out1 1095 Return Type - as per RTYPE 1096 Details : Right half of byte elements of 'in0' and 'in1' are interleaved 1097 and written to out0. 1098 */ 1099 #define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \ 1100 out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ 1101 out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3); \ 1102 } 1103 #define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__) 1104 #define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__) 1105 #define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__) 1106 #define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__) 1107 1108 #define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 1109 out0, out1, out2, out3) { \ 1110 ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ 1111 ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ 1112 } 1113 #define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__) 1114 #define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__) 1115 #define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__) 1116 #define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__) 1117 1118 #define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 1119 in8, in9, in10, in11, in12, in13, in14, in15, \ 1120 out0, out1, out2, out3, out4, out5, out6, out7) { \ 1121 ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 1122 out0, out1, out2, out3); \ 1123 ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15, \ 1124 out4, out5, out6, out7); \ 1125 } 1126 #define ILVR_B8_UH(...) 
ILVR_B8(v8u16, __VA_ARGS__) 1127 1128 /* Description : Interleave right half of halfword elements from vectors 1129 Arguments : Inputs - in0, in1, in2, in3 1130 Outputs - out0, out1 1131 Return Type - as per RTYPE 1132 Details : Right half of halfword elements of 'in0' and 'in1' are 1133 interleaved and written to 'out0'. 1134 */ 1135 #define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \ 1136 out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ 1137 out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \ 1138 } 1139 #define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__) 1140 1141 #define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 1142 out0, out1, out2, out3) { \ 1143 ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ 1144 ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ 1145 } 1146 #define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__) 1147 1148 #define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) { \ 1149 out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \ 1150 out1 = (RTYPE)__msa_ilvr_w((v4i32)in2, (v4i32)in3); \ 1151 } 1152 #define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__) 1153 #define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__) 1154 1155 #define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 1156 out0, out1, out2, out3) { \ 1157 ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1); \ 1158 ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3); \ 1159 } 1160 #define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__) 1161 1162 /* Description : Interleave right half of double word elements from vectors 1163 Arguments : Inputs - in0, in1, in2, in3 1164 Outputs - out0, out1 1165 Return Type - as per RTYPE 1166 Details : Right half of double word elements of 'in0' and 'in1' are 1167 interleaved and written to 'out0'. 1168 */ 1169 #define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) { \ 1170 out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \ 1171 out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \ 1172 } 1173 #define ILVR_D2_UB(...) 
ILVR_D2(v16u8, __VA_ARGS__) 1174 #define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__) 1175 #define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__) 1176 1177 #define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) { \ 1178 ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ 1179 out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5)); \ 1180 } 1181 #define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__) 1182 1183 #define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 1184 out0, out1, out2, out3) { \ 1185 ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ 1186 ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3); \ 1187 } 1188 #define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__) 1189 #define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__) 1190 1191 /* Description : Interleave both left and right half of input vectors 1192 Arguments : Inputs - in0, in1 1193 Outputs - out0, out1 1194 Return Type - as per RTYPE 1195 Details : Right half of byte elements from 'in0' and 'in1' are 1196 interleaved and written to 'out0' 1197 */ 1198 #define ILVRL_B2(RTYPE, in0, in1, out0, out1) { \ 1199 out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ 1200 out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ 1201 } 1202 #define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__) 1203 #define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__) 1204 #define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__) 1205 #define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__) 1206 1207 #define ILVRL_H2(RTYPE, in0, in1, out0, out1) { \ 1208 out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ 1209 out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \ 1210 } 1211 #define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__) 1212 #define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__) 1213 1214 #define ILVRL_W2(RTYPE, in0, in1, out0, out1) { \ 1215 out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \ 1216 out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ 1217 } 1218 #define ILVRL_W2_SH(...) 
ILVRL_W2(v8i16, __VA_ARGS__) 1219 #define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__) 1220 1221 /* Description : Saturate the halfword element values to the max 1222 unsigned value of (sat_val + 1) bits 1223 The element data width remains unchanged 1224 Arguments : Inputs - in0, in1, sat_val 1225 Outputs - in place operation 1226 Return Type - as per RTYPE 1227 Details : Each unsigned halfword element from 'in0' is saturated to the 1228 value generated with (sat_val + 1) bit range. 1229 The results are written in place 1230 */ 1231 #define SAT_UH2(RTYPE, in0, in1, sat_val) { \ 1232 in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val); \ 1233 in1 = (RTYPE)__msa_sat_u_h((v8u16)in1, sat_val); \ 1234 } 1235 #define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__) 1236 1237 #define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) { \ 1238 SAT_UH2(RTYPE, in0, in1, sat_val); \ 1239 SAT_UH2(RTYPE, in2, in3, sat_val) \ 1240 } 1241 #define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__) 1242 1243 /* Description : Saturate the halfword element values to the max 1244 unsigned value of (sat_val + 1) bits 1245 The element data width remains unchanged 1246 Arguments : Inputs - in0, in1, sat_val 1247 Outputs - in place operation 1248 Return Type - as per RTYPE 1249 Details : Each unsigned halfword element from 'in0' is saturated to the 1250 value generated with (sat_val + 1) bit range 1251 The results are written in place 1252 */ 1253 #define SAT_SH2(RTYPE, in0, in1, sat_val) { \ 1254 in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val); \ 1255 in1 = (RTYPE)__msa_sat_s_h((v8i16)in1, sat_val); \ 1256 } 1257 #define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__) 1258 1259 #define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) { \ 1260 SAT_SH2(RTYPE, in0, in1, sat_val); \ 1261 SAT_SH2(RTYPE, in2, in3, sat_val); \ 1262 } 1263 #define SAT_SH4_SH(...) 
SAT_SH4(v8i16, __VA_ARGS__) 1264 1265 /* Description : Indexed halfword element values are replicated to all 1266 elements in output vector 1267 Arguments : Inputs - in, idx0, idx1 1268 Outputs - out0, out1 1269 Return Type - as per RTYPE 1270 Details : 'idx0' element value from 'in' vector is replicated to all 1271 elements in 'out0' vector 1272 Valid index range for halfword operation is 0-7 1273 */ 1274 #define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) { \ 1275 out0 = (RTYPE)__msa_splati_h((v8i16)in, idx0); \ 1276 out1 = (RTYPE)__msa_splati_h((v8i16)in, idx1); \ 1277 } 1278 #define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__) 1279 1280 #define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, \ 1281 out0, out1, out2, out3) { \ 1282 SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \ 1283 SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3); \ 1284 } 1285 #define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__) 1286 #define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__) 1287 1288 /* Description : Pack even byte elements of vector pairs 1289 Arguments : Inputs - in0, in1, in2, in3 1290 Outputs - out0, out1 1291 Return Type - as per RTYPE 1292 Details : Even byte elements of 'in0' are copied to the left half of 1293 'out0' & even byte elements of 'in1' are copied to the right 1294 half of 'out0'. 1295 */ 1296 #define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \ 1297 out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1); \ 1298 out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3); \ 1299 } 1300 #define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__) 1301 #define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__) 1302 #define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__) 1303 1304 #define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 1305 out0, out1, out2, out3) { \ 1306 PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ 1307 PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ 1308 } 1309 #define PCKEV_B4_SB(...) 
PCKEV_B4(v16i8, __VA_ARGS__) 1310 #define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__) 1311 #define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__) 1312 1313 /* Description : Pack even halfword elements of vector pairs 1314 Arguments : Inputs - in0, in1, in2, in3 1315 Outputs - out0, out1 1316 Return Type - as per RTYPE 1317 Details : Even halfword elements of 'in0' are copied to the left half of 1318 'out0' & even halfword elements of 'in1' are copied to the 1319 right half of 'out0'. 1320 */ 1321 #define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \ 1322 out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1); \ 1323 out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3); \ 1324 } 1325 #define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__) 1326 #define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__) 1327 1328 #define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 1329 out0, out1, out2, out3) { \ 1330 PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ 1331 PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ 1332 } 1333 #define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__) 1334 1335 /* Description : Pack even double word elements of vector pairs 1336 Arguments : Inputs - in0, in1, in2, in3 1337 Outputs - out0, out1 1338 Return Type - as per RTYPE 1339 Details : Even double elements of 'in0' are copied to the left half of 1340 'out0' & even double elements of 'in1' are copied to the right 1341 half of 'out0'. 1342 */ 1343 #define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) { \ 1344 out0 = (RTYPE)__msa_pckev_d((v2i64)in0, (v2i64)in1); \ 1345 out1 = (RTYPE)__msa_pckev_d((v2i64)in2, (v2i64)in3); \ 1346 } 1347 #define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__) 1348 #define PCKEV_D2_SH(...) 
PCKEV_D2(v8i16, __VA_ARGS__) 1349 1350 #define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 1351 out0, out1, out2, out3) { \ 1352 PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ 1353 PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3); \ 1354 } 1355 #define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__) 1356 1357 /* Description : Each byte element is logically xor'ed with immediate 128 1358 Arguments : Inputs - in0, in1 1359 Outputs - in place operation 1360 Return Type - as per RTYPE 1361 Details : Each unsigned byte element from input vector 'in0' is 1362 logically xor'ed with 128 and the result is stored in-place. 1363 */ 1364 #define XORI_B2_128(RTYPE, in0, in1) { \ 1365 in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128); \ 1366 in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128); \ 1367 } 1368 #define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__) 1369 #define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__) 1370 1371 #define XORI_B3_128(RTYPE, in0, in1, in2) { \ 1372 XORI_B2_128(RTYPE, in0, in1); \ 1373 in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128); \ 1374 } 1375 #define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__) 1376 1377 #define XORI_B4_128(RTYPE, in0, in1, in2, in3) { \ 1378 XORI_B2_128(RTYPE, in0, in1); \ 1379 XORI_B2_128(RTYPE, in2, in3); \ 1380 } 1381 #define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__) 1382 #define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__) 1383 1384 #define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) { \ 1385 XORI_B4_128(RTYPE, in0, in1, in2, in3); \ 1386 XORI_B3_128(RTYPE, in4, in5, in6); \ 1387 } 1388 #define XORI_B7_128_SB(...) 
XORI_B7_128(v16i8, __VA_ARGS__) 1389 1390 /* Description : Average of signed halfword elements -> (a + b) / 2 1391 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 1392 Outputs - out0, out1, out2, out3 1393 Return Type - as per RTYPE 1394 Details : Each signed halfword element from 'in0' is added to each 1395 signed halfword element of 'in1' with full precision resulting 1396 in one extra bit in the result. The result is then divided by 1397 2 and written to 'out0' 1398 */ 1399 #define AVE_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 1400 out0, out1, out2, out3) { \ 1401 out0 = (RTYPE)__msa_ave_s_h((v8i16)in0, (v8i16)in1); \ 1402 out1 = (RTYPE)__msa_ave_s_h((v8i16)in2, (v8i16)in3); \ 1403 out2 = (RTYPE)__msa_ave_s_h((v8i16)in4, (v8i16)in5); \ 1404 out3 = (RTYPE)__msa_ave_s_h((v8i16)in6, (v8i16)in7); \ 1405 } 1406 #define AVE_SH4_SH(...) AVE_SH4(v8i16, __VA_ARGS__) 1407 1408 /* Description : Addition of signed halfword elements and signed saturation 1409 Arguments : Inputs - in0, in1, in2, in3 1410 Outputs - out0, out1 1411 Return Type - as per RTYPE 1412 Details : Signed halfword elements from 'in0' are added to signed 1413 halfword elements of 'in1'. The result is then signed saturated 1414 between halfword data type range 1415 */ 1416 #define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) { \ 1417 out0 = (RTYPE)__msa_adds_s_h((v8i16)in0, (v8i16)in1); \ 1418 out1 = (RTYPE)__msa_adds_s_h((v8i16)in2, (v8i16)in3); \ 1419 } 1420 #define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__) 1421 1422 #define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 1423 out0, out1, out2, out3) { \ 1424 ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1); \ 1425 ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3); \ 1426 } 1427 #define ADDS_SH4_SH(...) 
ADDS_SH4(v8i16, __VA_ARGS__) 1428 1429 /* Description : Shift left all elements of vector (generic for all data types) 1430 Arguments : Inputs - in0, in1, in2, in3, shift 1431 Outputs - in place operation 1432 Return Type - as per input vector RTYPE 1433 Details : Each element of vector 'in0' is left shifted by 'shift' and 1434 the result is written in-place. 1435 */ 1436 #define SLLI_4V(in0, in1, in2, in3, shift) { \ 1437 in0 = in0 << shift; \ 1438 in1 = in1 << shift; \ 1439 in2 = in2 << shift; \ 1440 in3 = in3 << shift; \ 1441 } 1442 1443 /* Description : Arithmetic shift right all elements of vector 1444 (generic for all data types) 1445 Arguments : Inputs - in0, in1, in2, in3, shift 1446 Outputs - in place operation 1447 Return Type - as per input vector RTYPE 1448 Details : Each element of vector 'in0' is right shifted by 'shift' and 1449 the result is written in-place. 'shift' is a GP variable. 1450 */ 1451 #define SRA_4V(in0, in1, in2, in3, shift) { \ 1452 in0 = in0 >> shift; \ 1453 in1 = in1 >> shift; \ 1454 in2 = in2 >> shift; \ 1455 in3 = in3 >> shift; \ 1456 } 1457 1458 /* Description : Shift right arithmetic rounded words 1459 Arguments : Inputs - in0, in1, shift 1460 Outputs - in place operation 1461 Return Type - as per RTYPE 1462 Details : Each element of vector 'in0' is shifted right arithmetically by 1463 the number of bits in the corresponding element in the vector 1464 'shift'. The last discarded bit is added to shifted value for 1465 rounding and the result is written in-place. 1466 'shift' is a vector. 1467 */ 1468 #define SRAR_W2(RTYPE, in0, in1, shift) { \ 1469 in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)shift); \ 1470 in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)shift); \ 1471 } 1472 1473 #define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) { \ 1474 SRAR_W2(RTYPE, in0, in1, shift) \ 1475 SRAR_W2(RTYPE, in2, in3, shift) \ 1476 } 1477 #define SRAR_W4_SW(...) 
SRAR_W4(v4i32, __VA_ARGS__) 1478 1479 /* Description : Shift right arithmetic rounded (immediate) 1480 Arguments : Inputs - in0, in1, shift 1481 Outputs - in place operation 1482 Return Type - as per RTYPE 1483 Details : Each element of vector 'in0' is shifted right arithmetically by 1484 the value in 'shift'. The last discarded bit is added to the 1485 shifted value for rounding and the result is written in-place. 1486 'shift' is an immediate value. 1487 */ 1488 #define SRARI_H2(RTYPE, in0, in1, shift) { \ 1489 in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift); \ 1490 in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift); \ 1491 } 1492 #define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__) 1493 #define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__) 1494 1495 #define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) { \ 1496 SRARI_H2(RTYPE, in0, in1, shift); \ 1497 SRARI_H2(RTYPE, in2, in3, shift); \ 1498 } 1499 #define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__) 1500 #define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__) 1501 1502 #define SRARI_W2(RTYPE, in0, in1, shift) { \ 1503 in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift); \ 1504 in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift); \ 1505 } 1506 #define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__) 1507 1508 #define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) { \ 1509 SRARI_W2(RTYPE, in0, in1, shift); \ 1510 SRARI_W2(RTYPE, in2, in3, shift); \ 1511 } 1512 #define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__) 1513 1514 /* Description : Logical shift right all elements of vector (immediate) 1515 Arguments : Inputs - in0, in1, in2, in3, shift 1516 Outputs - out0, out1, out2, out3 1517 Return Type - as per RTYPE 1518 Details : Each element of vector 'in0' is right shifted by 'shift' and 1519 the result is written in-place. 'shift' is an immediate value. 
1520 */ 1521 #define SRLI_H4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, shift) { \ 1522 out0 = (RTYPE)__msa_srli_h((v8i16)in0, shift); \ 1523 out1 = (RTYPE)__msa_srli_h((v8i16)in1, shift); \ 1524 out2 = (RTYPE)__msa_srli_h((v8i16)in2, shift); \ 1525 out3 = (RTYPE)__msa_srli_h((v8i16)in3, shift); \ 1526 } 1527 #define SRLI_H4_SH(...) SRLI_H4(v8i16, __VA_ARGS__) 1528 1529 /* Description : Multiplication of pairs of vectors 1530 Arguments : Inputs - in0, in1, in2, in3 1531 Outputs - out0, out1 1532 Details : Each element from 'in0' is multiplied with elements from 'in1' 1533 and the result is written to 'out0' 1534 */ 1535 #define MUL2(in0, in1, in2, in3, out0, out1) { \ 1536 out0 = in0 * in1; \ 1537 out1 = in2 * in3; \ 1538 } 1539 #define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, \ 1540 out0, out1, out2, out3) { \ 1541 MUL2(in0, in1, in2, in3, out0, out1); \ 1542 MUL2(in4, in5, in6, in7, out2, out3); \ 1543 } 1544 1545 /* Description : Addition of 2 pairs of vectors 1546 Arguments : Inputs - in0, in1, in2, in3 1547 Outputs - out0, out1 1548 Details : Each element in 'in0' is added to 'in1' and result is written 1549 to 'out0'. 1550 */ 1551 #define ADD2(in0, in1, in2, in3, out0, out1) { \ 1552 out0 = in0 + in1; \ 1553 out1 = in2 + in3; \ 1554 } 1555 #define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, \ 1556 out0, out1, out2, out3) { \ 1557 ADD2(in0, in1, in2, in3, out0, out1); \ 1558 ADD2(in4, in5, in6, in7, out2, out3); \ 1559 } 1560 1561 /* Description : Subtraction of 2 pairs of vectors 1562 Arguments : Inputs - in0, in1, in2, in3 1563 Outputs - out0, out1 1564 Details : Each element in 'in1' is subtracted from 'in0' and result is 1565 written to 'out0'. 
1566 */ 1567 #define SUB2(in0, in1, in2, in3, out0, out1) { \ 1568 out0 = in0 - in1; \ 1569 out1 = in2 - in3; \ 1570 } 1571 #define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, \ 1572 out0, out1, out2, out3) { \ 1573 out0 = in0 - in1; \ 1574 out1 = in2 - in3; \ 1575 out2 = in4 - in5; \ 1576 out3 = in6 - in7; \ 1577 } 1578 1579 /* Description : Sign extend halfword elements from right half of the vector 1580 Arguments : Input - in (halfword vector) 1581 Output - out (sign extended word vector) 1582 Return Type - signed word 1583 Details : Sign bit of halfword elements from input vector 'in' is 1584 extracted and interleaved with same vector 'in0' to generate 1585 4 word elements keeping sign intact 1586 */ 1587 #define UNPCK_R_SH_SW(in, out) { \ 1588 v8i16 sign_m; \ 1589 \ 1590 sign_m = __msa_clti_s_h((v8i16)in, 0); \ 1591 out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in); \ 1592 } 1593 1594 /* Description : Zero extend unsigned byte elements to halfword elements 1595 Arguments : Input - in (unsigned byte vector) 1596 Outputs - out0, out1 (unsigned halfword vectors) 1597 Return Type - signed halfword 1598 Details : Zero extended right half of vector is returned in 'out0' 1599 Zero extended left half of vector is returned in 'out1' 1600 */ 1601 #define UNPCK_UB_SH(in, out0, out1) { \ 1602 v16i8 zero_m = { 0 }; \ 1603 \ 1604 ILVRL_B2_SH(zero_m, in, out0, out1); \ 1605 } 1606 1607 /* Description : Sign extend halfword elements from input vector and return 1608 the result in pair of vectors 1609 Arguments : Input - in (halfword vector) 1610 Outputs - out0, out1 (sign extended word vectors) 1611 Return Type - signed word 1612 Details : Sign bit of halfword elements from input vector 'in' is 1613 extracted and interleaved right with same vector 'in0' to 1614 generate 4 signed word elements in 'out0' 1615 Then interleaved left with same vector 'in0' to 1616 generate 4 signed word elements in 'out1' 1617 */ 1618 #define UNPCK_SH_SW(in, out0, out1) { \ 1619 v8i16 tmp_m; \ 1620 \ 
1621 tmp_m = __msa_clti_s_h((v8i16)in, 0); \ 1622 ILVRL_H2_SW(tmp_m, in, out0, out1); \ 1623 } 1624 1625 /* Description : Butterfly of 4 input vectors 1626 Arguments : Inputs - in0, in1, in2, in3 1627 Outputs - out0, out1, out2, out3 1628 Details : Butterfly operation 1629 */ 1630 #define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) { \ 1631 out0 = in0 + in3; \ 1632 out1 = in1 + in2; \ 1633 \ 1634 out2 = in1 - in2; \ 1635 out3 = in0 - in3; \ 1636 } 1637 1638 /* Description : Butterfly of 8 input vectors 1639 Arguments : Inputs - in0 ... in7 1640 Outputs - out0 .. out7 1641 Details : Butterfly operation 1642 */ 1643 #define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, \ 1644 out0, out1, out2, out3, out4, out5, out6, out7) { \ 1645 out0 = in0 + in7; \ 1646 out1 = in1 + in6; \ 1647 out2 = in2 + in5; \ 1648 out3 = in3 + in4; \ 1649 \ 1650 out4 = in3 - in4; \ 1651 out5 = in2 - in5; \ 1652 out6 = in1 - in6; \ 1653 out7 = in0 - in7; \ 1654 } 1655 1656 /* Description : Butterfly of 16 input vectors 1657 Arguments : Inputs - in0 ... in15 1658 Outputs - out0 .. 
out15 1659 Details : Butterfly operation 1660 */ 1661 #define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, \ 1662 in8, in9, in10, in11, in12, in13, in14, in15, \ 1663 out0, out1, out2, out3, out4, out5, out6, out7, \ 1664 out8, out9, out10, out11, out12, out13, out14, out15) { \ 1665 out0 = in0 + in15; \ 1666 out1 = in1 + in14; \ 1667 out2 = in2 + in13; \ 1668 out3 = in3 + in12; \ 1669 out4 = in4 + in11; \ 1670 out5 = in5 + in10; \ 1671 out6 = in6 + in9; \ 1672 out7 = in7 + in8; \ 1673 \ 1674 out8 = in7 - in8; \ 1675 out9 = in6 - in9; \ 1676 out10 = in5 - in10; \ 1677 out11 = in4 - in11; \ 1678 out12 = in3 - in12; \ 1679 out13 = in2 - in13; \ 1680 out14 = in1 - in14; \ 1681 out15 = in0 - in15; \ 1682 } 1683 1684 /* Description : Transpose input 8x8 byte block 1685 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 1686 Outputs - out0, out1, out2, out3, out4, out5, out6, out7 1687 Return Type - as per RTYPE 1688 */ 1689 #define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 1690 out0, out1, out2, out3, out4, out5, out6, out7) { \ 1691 v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ 1692 v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ 1693 \ 1694 ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, \ 1695 tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ 1696 ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m); \ 1697 ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m); \ 1698 ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2); \ 1699 ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6); \ 1700 SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8); \ 1701 SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8); \ 1702 } 1703 #define TRANSPOSE8x8_UB_UB(...) 
TRANSPOSE8x8_UB(v16u8, __VA_ARGS__) 1704 1705 /* Description : Transpose 16x8 block into 8x16 with byte elements in vectors 1706 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, 1707 in8, in9, in10, in11, in12, in13, in14, in15 1708 Outputs - out0, out1, out2, out3, out4, out5, out6, out7 1709 Return Type - unsigned byte 1710 */ 1711 #define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \ 1712 in8, in9, in10, in11, in12, in13, in14, in15, \ 1713 out0, out1, out2, out3, out4, out5, out6, out7) { \ 1714 v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ 1715 v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ 1716 \ 1717 ILVEV_D2_UB(in0, in8, in1, in9, out7, out6); \ 1718 ILVEV_D2_UB(in2, in10, in3, in11, out5, out4); \ 1719 ILVEV_D2_UB(in4, in12, in5, in13, out3, out2); \ 1720 ILVEV_D2_UB(in6, in14, in7, in15, out1, out0); \ 1721 \ 1722 tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7); \ 1723 tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7); \ 1724 tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5); \ 1725 tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5); \ 1726 out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3); \ 1727 tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3); \ 1728 out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1); \ 1729 tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1); \ 1730 \ 1731 ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \ 1732 out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ 1733 out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ 1734 \ 1735 tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m); \ 1736 tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5); \ 1737 out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ 1738 out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ 1739 \ 1740 ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \ 1741 out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ 1742 out5 = 
(v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ 1743 \ 1744 tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \ 1745 tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \ 1746 tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \ 1747 tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \ 1748 out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ 1749 out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ 1750 } 1751 1752 /* Description : Transpose 4x4 block with half word elements in vectors 1753 Arguments : Inputs - in0, in1, in2, in3 1754 Outputs - out0, out1, out2, out3 1755 Return Type - signed halfword 1756 */ 1757 #define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) { \ 1758 v8i16 s0_m, s1_m; \ 1759 \ 1760 ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m); \ 1761 ILVRL_W2_SH(s1_m, s0_m, out0, out2); \ 1762 out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ 1763 out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2); \ 1764 } 1765 1766 /* Description : Transpose 4x8 block with half word elements in vectors 1767 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 1768 Outputs - out0, out1, out2, out3, out4, out5, out6, out7 1769 Return Type - signed halfword 1770 */ 1771 #define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, \ 1772 out0, out1, out2, out3, out4, out5, out6, out7) { \ 1773 v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ 1774 v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n; \ 1775 v8i16 zero_m = { 0 }; \ 1776 \ 1777 ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, \ 1778 tmp0_n, tmp1_n, tmp2_n, tmp3_n); \ 1779 ILVRL_W2_SH(tmp1_n, tmp0_n, tmp0_m, tmp2_m); \ 1780 ILVRL_W2_SH(tmp3_n, tmp2_n, tmp1_m, tmp3_m); \ 1781 \ 1782 out0 = (v8i16)__msa_ilvr_d((v2i64)tmp1_m, (v2i64)tmp0_m); \ 1783 out1 = (v8i16)__msa_ilvl_d((v2i64)tmp1_m, (v2i64)tmp0_m); \ 1784 out2 = (v8i16)__msa_ilvr_d((v2i64)tmp3_m, (v2i64)tmp2_m); \ 1785 out3 = (v8i16)__msa_ilvl_d((v2i64)tmp3_m, (v2i64)tmp2_m); \ 1786 
\ 1787 out4 = zero_m; \ 1788 out5 = zero_m; \ 1789 out6 = zero_m; \ 1790 out7 = zero_m; \ 1791 } 1792 1793 /* Description : Transpose 8x4 block with half word elements in vectors 1794 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 1795 Outputs - out0, out1, out2, out3, out4, out5, out6, out7 1796 Return Type - signed halfword 1797 */ 1798 #define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) { \ 1799 v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ 1800 \ 1801 ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m); \ 1802 ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m); \ 1803 ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2); \ 1804 ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3); \ 1805 } 1806 1807 /* Description : Transpose 8x8 block with half word elements in vectors 1808 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 1809 Outputs - out0, out1, out2, out3, out4, out5, out6, out7 1810 Return Type - as per RTYPE 1811 */ 1812 #define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 1813 out0, out1, out2, out3, out4, out5, out6, out7) { \ 1814 v8i16 s0_m, s1_m; \ 1815 v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ 1816 v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ 1817 \ 1818 ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \ 1819 ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m); \ 1820 ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \ 1821 ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m); \ 1822 ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \ 1823 ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m); \ 1824 ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \ 1825 ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m); \ 1826 PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, \ 1827 tmp3_m, tmp7_m, out0, out2, out4, out6); \ 1828 out1 = (RTYPE)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m); \ 1829 out3 = (RTYPE)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m); \ 1830 out5 = (RTYPE)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m); \ 1831 out7 = (RTYPE)__msa_pckod_d((v2i64)tmp3_m, 
(v2i64)tmp7_m); \ 1832 } 1833 #define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__) 1834 1835 /* Description : Transpose 4x4 block with word elements in vectors 1836 Arguments : Inputs - in0, in1, in2, in3 1837 Outputs - out0, out1, out2, out3 1838 Return Type - signed word 1839 */ 1840 #define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) { \ 1841 v4i32 s0_m, s1_m, s2_m, s3_m; \ 1842 \ 1843 ILVRL_W2_SW(in1, in0, s0_m, s1_m); \ 1844 ILVRL_W2_SW(in3, in2, s2_m, s3_m); \ 1845 \ 1846 out0 = (v4i32)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m); \ 1847 out1 = (v4i32)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m); \ 1848 out2 = (v4i32)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m); \ 1849 out3 = (v4i32)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m); \ 1850 } 1851 1852 /* Description : Add block 4x4 1853 Arguments : Inputs - in0, in1, in2, in3, pdst, stride 1854 Details : Least significant 4 bytes from each input vector are added to 1855 the destination bytes, clipped between 0-255 and stored. 1856 */ 1857 #define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) { \ 1858 uint32_t src0_m, src1_m, src2_m, src3_m; \ 1859 v8i16 inp0_m, inp1_m, res0_m, res1_m; \ 1860 v16i8 dst0_m = { 0 }; \ 1861 v16i8 dst1_m = { 0 }; \ 1862 v16i8 zero_m = { 0 }; \ 1863 \ 1864 ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m) \ 1865 LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m); \ 1866 INSERT_W2_SB(src0_m, src1_m, dst0_m); \ 1867 INSERT_W2_SB(src2_m, src3_m, dst1_m); \ 1868 ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m); \ 1869 ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m); \ 1870 CLIP_SH2_0_255(res0_m, res1_m); \ 1871 PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \ 1872 ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride); \ 1873 } 1874 1875 /* Description : Pack even elements of input vectors & xor with 128 1876 Arguments : Inputs - in0, in1 1877 Output - out_m 1878 Return Type - unsigned byte 1879 Details : Signed byte even elements from 'in0' and 'in1' are 
packed 1880 together in one vector and the resulting vector is xor'ed with 1881 128 to shift the range from signed to unsigned byte 1882 */ 1883 #define PCKEV_XORI128_UB(in0, in1) ({ \ 1884 v16u8 out_m; \ 1885 \ 1886 out_m = (v16u8)__msa_pckev_b((v16i8)in1, (v16i8)in0); \ 1887 out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128); \ 1888 out_m; \ 1889 }) 1890 1891 /* Description : Converts inputs to unsigned bytes, interleave, average & store 1892 as 8x4 unsigned byte block 1893 Arguments : Inputs - in0, in1, in2, in3, dst0, dst1, dst2, dst3, 1894 pdst, stride 1895 */ 1896 #define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, \ 1897 dst0, dst1, dst2, dst3, pdst, stride) { \ 1898 v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ 1899 uint8_t *pdst_m = (uint8_t *)(pdst); \ 1900 \ 1901 tmp0_m = PCKEV_XORI128_UB(in0, in1); \ 1902 tmp1_m = PCKEV_XORI128_UB(in2, in3); \ 1903 ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \ 1904 AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \ 1905 ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \ 1906 } 1907 1908 /* Description : Pack even byte elements and store byte vector in destination 1909 memory 1910 Arguments : Inputs - in0, in1, pdst 1911 */ 1912 #define PCKEV_ST_SB(in0, in1, pdst) { \ 1913 v16i8 tmp_m; \ 1914 \ 1915 tmp_m = __msa_pckev_b((v16i8)in1, (v16i8)in0); \ 1916 ST_SB(tmp_m, (pdst)); \ 1917 } 1918 1919 /* Description : Horizontal 2 tap filter kernel code 1920 Arguments : Inputs - in0, in1, mask, coeff, shift 1921 */ 1922 #define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) ({ \ 1923 v16i8 tmp0_m; \ 1924 v8u16 tmp1_m; \ 1925 \ 1926 tmp0_m = __msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0); \ 1927 tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)coeff); \ 1928 tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift); \ 1929 \ 1930 tmp1_m; \ 1931 }) 1932 #endif /* VPX_DSP_MIPS_MACROS_MSA_H_ */ 1933