1 /* 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #ifndef VPX_VPX_DSP_MIPS_MACROS_MSA_H_ 12 #define VPX_VPX_DSP_MIPS_MACROS_MSA_H_ 13 14 #include <msa.h> 15 16 #include "./vpx_config.h" 17 #include "vpx/vpx_integer.h" 18 19 #define LD_V(RTYPE, psrc) *((const RTYPE *)(psrc)) 20 #define LD_UB(...) LD_V(v16u8, __VA_ARGS__) 21 #define LD_SB(...) LD_V(v16i8, __VA_ARGS__) 22 #define LD_UH(...) LD_V(v8u16, __VA_ARGS__) 23 #define LD_SH(...) LD_V(v8i16, __VA_ARGS__) 24 #define LD_SW(...) LD_V(v4i32, __VA_ARGS__) 25 26 #define ST_V(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) 27 #define ST_UB(...) ST_V(v16u8, __VA_ARGS__) 28 #define ST_SB(...) ST_V(v16i8, __VA_ARGS__) 29 #define ST_SH(...) ST_V(v8i16, __VA_ARGS__) 30 #define ST_SW(...) 
ST_V(v4i32, __VA_ARGS__) 31 32 #if (__mips_isa_rev >= 6) 33 #define LH(psrc) \ 34 ({ \ 35 uint16_t val_lh_m = *(const uint16_t *)(psrc); \ 36 val_lh_m; \ 37 }) 38 39 #define LW(psrc) \ 40 ({ \ 41 uint32_t val_lw_m = *(const uint32_t *)(psrc); \ 42 val_lw_m; \ 43 }) 44 45 #if (__mips == 64) 46 #define LD(psrc) \ 47 ({ \ 48 uint64_t val_ld_m = *(const uint64_t *)(psrc); \ 49 val_ld_m; \ 50 }) 51 #else // !(__mips == 64) 52 #define LD(psrc) \ 53 ({ \ 54 const uint8_t *psrc_ld_m = (const uint8_t *)(psrc); \ 55 uint32_t val0_ld_m, val1_ld_m; \ 56 uint64_t val_ld_m = 0; \ 57 \ 58 val0_ld_m = LW(psrc_ld_m); \ 59 val1_ld_m = LW(psrc_ld_m + 4); \ 60 \ 61 val_ld_m = (uint64_t)(val1_ld_m); \ 62 val_ld_m = (uint64_t)((val_ld_m << 32) & 0xFFFFFFFF00000000); \ 63 val_ld_m = (uint64_t)(val_ld_m | (uint64_t)val0_ld_m); \ 64 \ 65 val_ld_m; \ 66 }) 67 #endif // (__mips == 64) 68 69 #define SH(val, pdst) *(uint16_t *)(pdst) = (val); 70 #define SW(val, pdst) *(uint32_t *)(pdst) = (val); 71 #define SD(val, pdst) *(uint64_t *)(pdst) = (val); 72 #else // !(__mips_isa_rev >= 6) 73 #define LH(psrc) \ 74 ({ \ 75 const uint8_t *psrc_lh_m = (const uint8_t *)(psrc); \ 76 uint16_t val_lh_m; \ 77 \ 78 __asm__ __volatile__("ulh %[val_lh_m], %[psrc_lh_m] \n\t" \ 79 \ 80 : [val_lh_m] "=r"(val_lh_m) \ 81 : [psrc_lh_m] "m"(*psrc_lh_m)); \ 82 \ 83 val_lh_m; \ 84 }) 85 86 #define LW(psrc) \ 87 ({ \ 88 const uint8_t *psrc_lw_m = (const uint8_t *)(psrc); \ 89 uint32_t val_lw_m; \ 90 \ 91 __asm__ __volatile__( \ 92 "lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t" \ 93 "lwl %[val_lw_m], 3(%[psrc_lw_m]) \n\t" \ 94 : [val_lw_m] "=&r"(val_lw_m) \ 95 : [psrc_lw_m] "r"(psrc_lw_m)); \ 96 \ 97 val_lw_m; \ 98 }) 99 100 #if (__mips == 64) 101 #define LD(psrc) \ 102 ({ \ 103 const uint8_t *psrc_ld_m = (const uint8_t *)(psrc); \ 104 uint64_t val_ld_m = 0; \ 105 \ 106 __asm__ __volatile__( \ 107 "ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t" \ 108 "ldl %[val_ld_m], 7(%[psrc_ld_m]) \n\t" \ 109 : [val_ld_m] "=&r"(val_ld_m) \ 110 : 
[psrc_ld_m] "r"(psrc_ld_m)); \ 111 \ 112 val_ld_m; \ 113 }) 114 #else // !(__mips == 64) 115 #define LD(psrc) \ 116 ({ \ 117 const uint8_t *psrc_ld_m = (const uint8_t *)(psrc); \ 118 uint32_t val0_ld_m, val1_ld_m; \ 119 uint64_t val_ld_m = 0; \ 120 \ 121 val0_ld_m = LW(psrc_ld_m); \ 122 val1_ld_m = LW(psrc_ld_m + 4); \ 123 \ 124 val_ld_m = (uint64_t)(val1_ld_m); \ 125 val_ld_m = (uint64_t)((val_ld_m << 32) & 0xFFFFFFFF00000000); \ 126 val_ld_m = (uint64_t)(val_ld_m | (uint64_t)val0_ld_m); \ 127 \ 128 val_ld_m; \ 129 }) 130 #endif // (__mips == 64) 131 132 #define SH(val, pdst) \ 133 { \ 134 uint8_t *pdst_sh_m = (uint8_t *)(pdst); \ 135 const uint16_t val_sh_m = (val); \ 136 \ 137 __asm__ __volatile__("ush %[val_sh_m], %[pdst_sh_m] \n\t" \ 138 \ 139 : [pdst_sh_m] "=m"(*pdst_sh_m) \ 140 : [val_sh_m] "r"(val_sh_m)); \ 141 } 142 143 #define SW(val, pdst) \ 144 { \ 145 uint8_t *pdst_sw_m = (uint8_t *)(pdst); \ 146 const uint32_t val_sw_m = (val); \ 147 \ 148 __asm__ __volatile__("usw %[val_sw_m], %[pdst_sw_m] \n\t" \ 149 \ 150 : [pdst_sw_m] "=m"(*pdst_sw_m) \ 151 : [val_sw_m] "r"(val_sw_m)); \ 152 } 153 154 #define SD(val, pdst) \ 155 { \ 156 uint8_t *pdst_sd_m = (uint8_t *)(pdst); \ 157 uint32_t val0_sd_m, val1_sd_m; \ 158 \ 159 val0_sd_m = (uint32_t)((val)&0x00000000FFFFFFFF); \ 160 val1_sd_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ 161 \ 162 SW(val0_sd_m, pdst_sd_m); \ 163 SW(val1_sd_m, pdst_sd_m + 4); \ 164 } 165 #endif // (__mips_isa_rev >= 6) 166 167 /* Description : Load 4 words with stride 168 Arguments : Inputs - psrc, stride 169 Outputs - out0, out1, out2, out3 170 Details : Load word in 'out0' from (psrc) 171 Load word in 'out1' from (psrc + stride) 172 Load word in 'out2' from (psrc + 2 * stride) 173 Load word in 'out3' from (psrc + 3 * stride) 174 */ 175 #define LW4(psrc, stride, out0, out1, out2, out3) \ 176 { \ 177 out0 = LW((psrc)); \ 178 out1 = LW((psrc) + stride); \ 179 out2 = LW((psrc) + 2 * stride); \ 180 out3 = LW((psrc) + 3 * stride); \ 
181 } 182 183 /* Description : Load double words with stride 184 Arguments : Inputs - psrc, stride 185 Outputs - out0, out1 186 Details : Load double word in 'out0' from (psrc) 187 Load double word in 'out1' from (psrc + stride) 188 */ 189 #define LD2(psrc, stride, out0, out1) \ 190 { \ 191 out0 = LD((psrc)); \ 192 out1 = LD((psrc) + stride); \ 193 } 194 #define LD4(psrc, stride, out0, out1, out2, out3) \ 195 { \ 196 LD2((psrc), stride, out0, out1); \ 197 LD2((psrc) + 2 * stride, stride, out2, out3); \ 198 } 199 200 /* Description : Store 4 words with stride 201 Arguments : Inputs - in0, in1, in2, in3, pdst, stride 202 Details : Store word from 'in0' to (pdst) 203 Store word from 'in1' to (pdst + stride) 204 Store word from 'in2' to (pdst + 2 * stride) 205 Store word from 'in3' to (pdst + 3 * stride) 206 */ 207 #define SW4(in0, in1, in2, in3, pdst, stride) \ 208 { \ 209 SW(in0, (pdst)) \ 210 SW(in1, (pdst) + stride); \ 211 SW(in2, (pdst) + 2 * stride); \ 212 SW(in3, (pdst) + 3 * stride); \ 213 } 214 215 /* Description : Store 4 double words with stride 216 Arguments : Inputs - in0, in1, in2, in3, pdst, stride 217 Details : Store double word from 'in0' to (pdst) 218 Store double word from 'in1' to (pdst + stride) 219 Store double word from 'in2' to (pdst + 2 * stride) 220 Store double word from 'in3' to (pdst + 3 * stride) 221 */ 222 #define SD4(in0, in1, in2, in3, pdst, stride) \ 223 { \ 224 SD(in0, (pdst)) \ 225 SD(in1, (pdst) + stride); \ 226 SD(in2, (pdst) + 2 * stride); \ 227 SD(in3, (pdst) + 3 * stride); \ 228 } 229 230 /* Description : Load vector elements with stride 231 Arguments : Inputs - psrc, stride 232 Outputs - out0, out1 233 Return Type - as per RTYPE 234 Details : Load 16 byte elements in 'out0' from (psrc) 235 Load 16 byte elements in 'out1' from (psrc + stride) 236 */ 237 #define LD_V2(RTYPE, psrc, stride, out0, out1) \ 238 { \ 239 out0 = LD_V(RTYPE, (psrc)); \ 240 out1 = LD_V(RTYPE, (psrc) + stride); \ 241 } 242 #define LD_UB2(...) 
LD_V2(v16u8, __VA_ARGS__) 243 #define LD_SB2(...) LD_V2(v16i8, __VA_ARGS__) 244 #define LD_SH2(...) LD_V2(v8i16, __VA_ARGS__) 245 #define LD_SW2(...) LD_V2(v4i32, __VA_ARGS__) 246 247 #define LD_V3(RTYPE, psrc, stride, out0, out1, out2) \ 248 { \ 249 LD_V2(RTYPE, (psrc), stride, out0, out1); \ 250 out2 = LD_V(RTYPE, (psrc) + 2 * stride); \ 251 } 252 #define LD_UB3(...) LD_V3(v16u8, __VA_ARGS__) 253 254 #define LD_V4(RTYPE, psrc, stride, out0, out1, out2, out3) \ 255 { \ 256 LD_V2(RTYPE, (psrc), stride, out0, out1); \ 257 LD_V2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ 258 } 259 #define LD_UB4(...) LD_V4(v16u8, __VA_ARGS__) 260 #define LD_SB4(...) LD_V4(v16i8, __VA_ARGS__) 261 #define LD_SH4(...) LD_V4(v8i16, __VA_ARGS__) 262 263 #define LD_V5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \ 264 { \ 265 LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ 266 out4 = LD_V(RTYPE, (psrc) + 4 * stride); \ 267 } 268 #define LD_UB5(...) LD_V5(v16u8, __VA_ARGS__) 269 #define LD_SB5(...) LD_V5(v16i8, __VA_ARGS__) 270 271 #define LD_V7(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6) \ 272 { \ 273 LD_V5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \ 274 LD_V2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \ 275 } 276 #define LD_SB7(...) LD_V7(v16i8, __VA_ARGS__) 277 278 #define LD_V8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ 279 out7) \ 280 { \ 281 LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ 282 LD_V4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ 283 } 284 #define LD_UB8(...) LD_V8(v16u8, __VA_ARGS__) 285 #define LD_SB8(...) LD_V8(v16i8, __VA_ARGS__) 286 #define LD_SH8(...) 
LD_V8(v8i16, __VA_ARGS__) 287 288 #define LD_V16(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ 289 out7, out8, out9, out10, out11, out12, out13, out14, out15) \ 290 { \ 291 LD_V8(RTYPE, (psrc), stride, out0, out1, out2, out3, out4, out5, out6, \ 292 out7); \ 293 LD_V8(RTYPE, (psrc) + 8 * stride, stride, out8, out9, out10, out11, out12, \ 294 out13, out14, out15); \ 295 } 296 #define LD_SH16(...) LD_V16(v8i16, __VA_ARGS__) 297 298 /* Description : Load 4x4 block of signed halfword elements from 1D source 299 data into 4 vectors (Each vector with 4 signed halfwords) 300 Arguments : Input - psrc 301 Outputs - out0, out1, out2, out3 302 */ 303 #define LD4x4_SH(psrc, out0, out1, out2, out3) \ 304 { \ 305 out0 = LD_SH(psrc); \ 306 out2 = LD_SH(psrc + 8); \ 307 out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ 308 out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \ 309 } 310 311 /* Description : Store vectors with stride 312 Arguments : Inputs - in0, in1, pdst, stride 313 Details : Store 16 byte elements from 'in0' to (pdst) 314 Store 16 byte elements from 'in1' to (pdst + stride) 315 */ 316 #define ST_V2(RTYPE, in0, in1, pdst, stride) \ 317 { \ 318 ST_V(RTYPE, in0, (pdst)); \ 319 ST_V(RTYPE, in1, (pdst) + stride); \ 320 } 321 #define ST_UB2(...) ST_V2(v16u8, __VA_ARGS__) 322 #define ST_SH2(...) ST_V2(v8i16, __VA_ARGS__) 323 #define ST_SW2(...) ST_V2(v4i32, __VA_ARGS__) 324 325 #define ST_V4(RTYPE, in0, in1, in2, in3, pdst, stride) \ 326 { \ 327 ST_V2(RTYPE, in0, in1, (pdst), stride); \ 328 ST_V2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ 329 } 330 #define ST_UB4(...) ST_V4(v16u8, __VA_ARGS__) 331 #define ST_SH4(...) ST_V4(v8i16, __VA_ARGS__) 332 333 #define ST_V8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ 334 { \ 335 ST_V4(RTYPE, in0, in1, in2, in3, pdst, stride); \ 336 ST_V4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ 337 } 338 #define ST_UB8(...) ST_V8(v16u8, __VA_ARGS__) 339 #define ST_SH8(...) 
ST_V8(v8i16, __VA_ARGS__) 340 341 /* Description : Store 2x4 byte block to destination memory from input vector 342 Arguments : Inputs - in, stidx, pdst, stride 343 Details : Index 'stidx' halfword element from 'in' vector is copied to 344 the GP register and stored to (pdst) 345 Index 'stidx+1' halfword element from 'in' vector is copied to 346 the GP register and stored to (pdst + stride) 347 Index 'stidx+2' halfword element from 'in' vector is copied to 348 the GP register and stored to (pdst + 2 * stride) 349 Index 'stidx+3' halfword element from 'in' vector is copied to 350 the GP register and stored to (pdst + 3 * stride) 351 */ 352 #define ST2x4_UB(in, stidx, pdst, stride) \ 353 { \ 354 uint16_t out0_m, out1_m, out2_m, out3_m; \ 355 uint8_t *pblk_2x4_m = (uint8_t *)(pdst); \ 356 \ 357 out0_m = __msa_copy_u_h((v8i16)in, (stidx)); \ 358 out1_m = __msa_copy_u_h((v8i16)in, (stidx + 1)); \ 359 out2_m = __msa_copy_u_h((v8i16)in, (stidx + 2)); \ 360 out3_m = __msa_copy_u_h((v8i16)in, (stidx + 3)); \ 361 \ 362 SH(out0_m, pblk_2x4_m); \ 363 SH(out1_m, pblk_2x4_m + stride); \ 364 SH(out2_m, pblk_2x4_m + 2 * stride); \ 365 SH(out3_m, pblk_2x4_m + 3 * stride); \ 366 } 367 368 /* Description : Store 4x2 byte block to destination memory from input vector 369 Arguments : Inputs - in, pdst, stride 370 Details : Index 0 word element from 'in' vector is copied to the GP 371 register and stored to (pdst) 372 Index 1 word element from 'in' vector is copied to the GP 373 register and stored to (pdst + stride) 374 */ 375 #define ST4x2_UB(in, pdst, stride) \ 376 { \ 377 uint32_t out0_m, out1_m; \ 378 uint8_t *pblk_4x2_m = (uint8_t *)(pdst); \ 379 \ 380 out0_m = __msa_copy_u_w((v4i32)in, 0); \ 381 out1_m = __msa_copy_u_w((v4i32)in, 1); \ 382 \ 383 SW(out0_m, pblk_4x2_m); \ 384 SW(out1_m, pblk_4x2_m + stride); \ 385 } 386 387 /* Description : Store 4x4 byte block to destination memory from input vector 388 Arguments : Inputs - in0, in1, pdst, stride 389 Details : 'Idx0' word element 
from input vector 'in0' is copied to the 390 GP register and stored to (pdst) 391 'Idx1' word element from input vector 'in0' is copied to the 392 GP register and stored to (pdst + stride) 393 'Idx2' word element from input vector 'in0' is copied to the 394 GP register and stored to (pdst + 2 * stride) 395 'Idx3' word element from input vector 'in0' is copied to the 396 GP register and stored to (pdst + 3 * stride) 397 */ 398 #define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \ 399 { \ 400 uint32_t out0_m, out1_m, out2_m, out3_m; \ 401 uint8_t *pblk_4x4_m = (uint8_t *)(pdst); \ 402 \ 403 out0_m = __msa_copy_u_w((v4i32)in0, idx0); \ 404 out1_m = __msa_copy_u_w((v4i32)in0, idx1); \ 405 out2_m = __msa_copy_u_w((v4i32)in1, idx2); \ 406 out3_m = __msa_copy_u_w((v4i32)in1, idx3); \ 407 \ 408 SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride); \ 409 } 410 #define ST4x8_UB(in0, in1, pdst, stride) \ 411 { \ 412 uint8_t *pblk_4x8 = (uint8_t *)(pdst); \ 413 \ 414 ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride); \ 415 ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride); \ 416 } 417 418 /* Description : Store 8x1 byte block to destination memory from input vector 419 Arguments : Inputs - in, pdst 420 Details : Index 0 double word element from 'in' vector is copied to the 421 GP register and stored to (pdst) 422 */ 423 #define ST8x1_UB(in, pdst) \ 424 { \ 425 uint64_t out0_m; \ 426 \ 427 out0_m = __msa_copy_u_d((v2i64)in, 0); \ 428 SD(out0_m, pdst); \ 429 } 430 431 /* Description : Store 8x2 byte block to destination memory from input vector 432 Arguments : Inputs - in, pdst, stride 433 Details : Index 0 double word element from 'in' vector is copied to the 434 GP register and stored to (pdst) 435 Index 1 double word element from 'in' vector is copied to the 436 GP register and stored to (pdst + stride) 437 */ 438 #define ST8x2_UB(in, pdst, stride) \ 439 { \ 440 uint64_t out0_m, out1_m; \ 441 uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \ 442 \ 443 
out0_m = __msa_copy_u_d((v2i64)in, 0); \ 444 out1_m = __msa_copy_u_d((v2i64)in, 1); \ 445 \ 446 SD(out0_m, pblk_8x2_m); \ 447 SD(out1_m, pblk_8x2_m + stride); \ 448 } 449 450 /* Description : Store 8x4 byte block to destination memory from input 451 vectors 452 Arguments : Inputs - in0, in1, pdst, stride 453 Details : Index 0 double word element from 'in0' vector is copied to the 454 GP register and stored to (pdst) 455 Index 1 double word element from 'in0' vector is copied to the 456 GP register and stored to (pdst + stride) 457 Index 0 double word element from 'in1' vector is copied to the 458 GP register and stored to (pdst + 2 * stride) 459 Index 1 double word element from 'in1' vector is copied to the 460 GP register and stored to (pdst + 3 * stride) 461 */ 462 #define ST8x4_UB(in0, in1, pdst, stride) \ 463 { \ 464 uint64_t out0_m, out1_m, out2_m, out3_m; \ 465 uint8_t *pblk_8x4_m = (uint8_t *)(pdst); \ 466 \ 467 out0_m = __msa_copy_u_d((v2i64)in0, 0); \ 468 out1_m = __msa_copy_u_d((v2i64)in0, 1); \ 469 out2_m = __msa_copy_u_d((v2i64)in1, 0); \ 470 out3_m = __msa_copy_u_d((v2i64)in1, 1); \ 471 \ 472 SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \ 473 } 474 475 /* Description : average with rounding (in0 + in1 + 1) / 2. 476 Arguments : Inputs - in0, in1, in2, in3, 477 Outputs - out0, out1 478 Return Type - as per RTYPE 479 Details : Each unsigned byte element from 'in0' vector is added with 480 each unsigned byte element from 'in1' vector. Then the average 481 with rounding is calculated and written to 'out0' 482 */ 483 #define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \ 484 { \ 485 out0 = (RTYPE)__msa_aver_u_b((v16u8)in0, (v16u8)in1); \ 486 out1 = (RTYPE)__msa_aver_u_b((v16u8)in2, (v16u8)in3); \ 487 } 488 #define AVER_UB2_UB(...) 
AVER_UB2(v16u8, __VA_ARGS__) 489 490 #define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ 491 out2, out3) \ 492 { \ 493 AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \ 494 AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3) \ 495 } 496 #define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__) 497 498 /* Description : Immediate number of elements to slide with zero 499 Arguments : Inputs - in0, in1, slide_val 500 Outputs - out0, out1 501 Return Type - as per RTYPE 502 Details : Byte elements from 'zero_m' vector are slid into 'in0' by 503 value specified in the 'slide_val' 504 */ 505 #define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) \ 506 { \ 507 v16i8 zero_m = { 0 }; \ 508 out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in0, slide_val); \ 509 out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in1, slide_val); \ 510 } 511 #define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__) 512 513 #define SLDI_B4_0(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, \ 514 slide_val) \ 515 { \ 516 SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val); \ 517 SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val); \ 518 } 519 #define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__) 520 521 /* Description : Immediate number of elements to slide 522 Arguments : Inputs - in0_0, in0_1, in1_0, in1_1, slide_val 523 Outputs - out0, out1 524 Return Type - as per RTYPE 525 Details : Byte elements from 'in0_0' vector are slid into 'in1_0' by 526 value specified in the 'slide_val' 527 */ 528 #define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \ 529 { \ 530 out0 = (RTYPE)__msa_sldi_b((v16i8)in0_0, (v16i8)in1_0, slide_val); \ 531 out1 = (RTYPE)__msa_sldi_b((v16i8)in0_1, (v16i8)in1_1, slide_val); \ 532 } 533 #define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__) 534 #define SLDI_B2_SH(...) 
SLDI_B2(v8i16, __VA_ARGS__) 535 536 #define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, out0, out1, \ 537 out2, slide_val) \ 538 { \ 539 SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \ 540 out2 = (RTYPE)__msa_sldi_b((v16i8)in0_2, (v16i8)in1_2, slide_val); \ 541 } 542 #define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__) 543 #define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__) 544 545 /* Description : Shuffle byte vector elements as per mask vector 546 Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 547 Outputs - out0, out1 548 Return Type - as per RTYPE 549 Details : Byte elements from 'in0' & 'in1' are copied selectively to 550 'out0' as per control vector 'mask0' 551 */ 552 #define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \ 553 { \ 554 out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \ 555 out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \ 556 } 557 #define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__) 558 #define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__) 559 #define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__) 560 #define VSHF_B2_SH(...) VSHF_B2(v8i16, __VA_ARGS__) 561 562 #define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, out0, out1, out2, \ 563 out3) \ 564 { \ 565 VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1); \ 566 VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3); \ 567 } 568 #define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__) 569 #define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__) 570 571 /* Description : Dot product of byte vector elements 572 Arguments : Inputs - mult0, mult1, cnst0, cnst1 573 Outputs - out0, out1 574 Return Type - as per RTYPE 575 Details : Unsigned byte elements from 'mult0' are multiplied with 576 unsigned byte elements from 'cnst0' producing a result 577 twice the size of input i.e. unsigned halfword. 
578 The multiplication result of adjacent odd-even elements 579 are added together and written to the 'out0' vector 580 */ 581 #define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ 582 { \ 583 out0 = (RTYPE)__msa_dotp_u_h((v16u8)mult0, (v16u8)cnst0); \ 584 out1 = (RTYPE)__msa_dotp_u_h((v16u8)mult1, (v16u8)cnst1); \ 585 } 586 #define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__) 587 588 #define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ 589 cnst3, out0, out1, out2, out3) \ 590 { \ 591 DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ 592 DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ 593 } 594 #define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__) 595 596 /* Description : Dot product of byte vector elements 597 Arguments : Inputs - mult0, mult1, cnst0, cnst1 598 Outputs - out0, out1 599 Return Type - as per RTYPE 600 Details : Signed byte elements from 'mult0' are multiplied with 601 signed byte elements from 'cnst0' producing a result 602 twice the size of input i.e. signed halfword. 603 The multiplication result of adjacent odd-even elements 604 are added together and written to the 'out0' vector 605 */ 606 #define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ 607 { \ 608 out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0); \ 609 out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1); \ 610 } 611 #define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__) 612 613 #define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ 614 cnst3, out0, out1, out2, out3) \ 615 { \ 616 DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ 617 DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ 618 } 619 #define DOTP_SB4_SH(...) 
DOTP_SB4(v8i16, __VA_ARGS__) 620 621 /* Description : Dot product of halfword vector elements 622 Arguments : Inputs - mult0, mult1, cnst0, cnst1 623 Outputs - out0, out1 624 Return Type - as per RTYPE 625 Details : Signed halfword elements from 'mult0' are multiplied with 626 signed halfword elements from 'cnst0' producing a result 627 twice the size of input i.e. signed word. 628 The multiplication result of adjacent odd-even elements 629 are added together and written to the 'out0' vector 630 */ 631 #define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ 632 { \ 633 out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0); \ 634 out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1); \ 635 } 636 #define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__) 637 638 #define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ 639 cnst3, out0, out1, out2, out3) \ 640 { \ 641 DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ 642 DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ 643 } 644 #define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__) 645 646 /* Description : Dot product of word vector elements 647 Arguments : Inputs - mult0, mult1, cnst0, cnst1 648 Outputs - out0, out1 649 Return Type - as per RTYPE 650 Details : Signed word elements from 'mult0' are multiplied with 651 signed word elements from 'cnst0' producing a result 652 twice the size of input i.e. signed double word. 653 The multiplication result of adjacent odd-even elements 654 are added together and written to the 'out0' vector 655 */ 656 #define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ 657 { \ 658 out0 = (RTYPE)__msa_dotp_s_d((v4i32)mult0, (v4i32)cnst0); \ 659 out1 = (RTYPE)__msa_dotp_s_d((v4i32)mult1, (v4i32)cnst1); \ 660 } 661 #define DOTP_SW2_SD(...) 
DOTP_SW2(v2i64, __VA_ARGS__) 662 663 /* Description : Dot product & addition of byte vector elements 664 Arguments : Inputs - mult0, mult1, cnst0, cnst1 665 Outputs - out0, out1 666 Return Type - as per RTYPE 667 Details : Signed byte elements from 'mult0' are multiplied with 668 signed byte elements from 'cnst0' producing a result 669 twice the size of input i.e. signed halfword. 670 The multiplication result of adjacent odd-even elements 671 are added to the 'out0' vector 672 */ 673 #define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ 674 { \ 675 out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0, (v16i8)mult0, (v16i8)cnst0); \ 676 out1 = (RTYPE)__msa_dpadd_s_h((v8i16)out1, (v16i8)mult1, (v16i8)cnst1); \ 677 } 678 #define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__) 679 680 #define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ 681 cnst3, out0, out1, out2, out3) \ 682 { \ 683 DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ 684 DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ 685 } 686 #define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__) 687 688 /* Description : Dot product & addition of halfword vector elements 689 Arguments : Inputs - mult0, mult1, cnst0, cnst1 690 Outputs - out0, out1 691 Return Type - as per RTYPE 692 Details : Signed halfword elements from 'mult0' are multiplied with 693 signed halfword elements from 'cnst0' producing a result 694 twice the size of input i.e. signed word. 695 The multiplication result of adjacent odd-even elements 696 are added to the 'out0' vector 697 */ 698 #define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ 699 { \ 700 out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)mult0, (v8i16)cnst0); \ 701 out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)mult1, (v8i16)cnst1); \ 702 } 703 #define DPADD_SH2_SW(...) 
DPADD_SH2(v4i32, __VA_ARGS__) 704 705 /* Description : Dot product & addition of double word vector elements 706 Arguments : Inputs - mult0, mult1 707 Outputs - out0, out1 708 Return Type - as per RTYPE 709 Details : Each signed word element from 'mult0' is multiplied with itself 710 producing an intermediate result twice the size of input 711 i.e. signed double word 712 The multiplication result of adjacent odd-even elements 713 are added to the 'out0' vector 714 */ 715 #define DPADD_SD2(RTYPE, mult0, mult1, out0, out1) \ 716 { \ 717 out0 = (RTYPE)__msa_dpadd_s_d((v2i64)out0, (v4i32)mult0, (v4i32)mult0); \ 718 out1 = (RTYPE)__msa_dpadd_s_d((v2i64)out1, (v4i32)mult1, (v4i32)mult1); \ 719 } 720 #define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__) 721 722 /* Description : Minimum values between unsigned elements of 723 either vector are copied to the output vector 724 Arguments : Inputs - in0, in1, min_vec 725 Outputs - in place operation 726 Return Type - as per RTYPE 727 Details : Minimum of unsigned halfword element values from 'in0' and 728 'min_vec' are written to output vector 'in0' 729 */ 730 #define MIN_UH2(RTYPE, in0, in1, min_vec) \ 731 { \ 732 in0 = (RTYPE)__msa_min_u_h((v8u16)in0, min_vec); \ 733 in1 = (RTYPE)__msa_min_u_h((v8u16)in1, min_vec); \ 734 } 735 #define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__) 736 737 #define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec) \ 738 { \ 739 MIN_UH2(RTYPE, in0, in1, min_vec); \ 740 MIN_UH2(RTYPE, in2, in3, min_vec); \ 741 } 742 #define MIN_UH4_UH(...) 
MIN_UH4(v8u16, __VA_ARGS__) 743 744 /* Description : Clips all signed halfword elements of input vector 745 between 0 & 255 746 Arguments : Input - in 747 Output - out_m 748 Return Type - signed halfword 749 */ 750 #define CLIP_SH_0_255(in) \ 751 ({ \ 752 v8i16 max_m = __msa_ldi_h(255); \ 753 v8i16 out_m; \ 754 \ 755 out_m = __msa_maxi_s_h((v8i16)in, 0); \ 756 out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \ 757 out_m; \ 758 }) 759 #define CLIP_SH2_0_255(in0, in1) \ 760 { \ 761 in0 = CLIP_SH_0_255(in0); \ 762 in1 = CLIP_SH_0_255(in1); \ 763 } 764 #define CLIP_SH4_0_255(in0, in1, in2, in3) \ 765 { \ 766 CLIP_SH2_0_255(in0, in1); \ 767 CLIP_SH2_0_255(in2, in3); \ 768 } 769 770 /* Description : Horizontal addition of 4 signed word elements of input vector 771 Arguments : Input - in (signed word vector) 772 Output - sum_m (i32 sum) 773 Return Type - signed word (GP) 774 Details : 4 signed word elements of 'in' vector are added together and 775 the resulting integer sum is returned 776 */ 777 #define HADD_SW_S32(in) \ 778 ({ \ 779 v2i64 res0_m, res1_m; \ 780 int32_t sum_m; \ 781 \ 782 res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in); \ 783 res1_m = __msa_splati_d(res0_m, 1); \ 784 res0_m = res0_m + res1_m; \ 785 sum_m = __msa_copy_s_w((v4i32)res0_m, 0); \ 786 sum_m; \ 787 }) 788 789 /* Description : Horizontal addition of 4 unsigned word elements 790 Arguments : Input - in (unsigned word vector) 791 Output - sum_m (u32 sum) 792 Return Type - unsigned word (GP) 793 Details : 4 unsigned word elements of 'in' vector are added together and 794 the resulting integer sum is returned 795 */ 796 #define HADD_UW_U32(in) \ 797 ({ \ 798 v2u64 res0_m, res1_m; \ 799 uint32_t sum_m; \ 800 \ 801 res0_m = __msa_hadd_u_d((v4u32)in, (v4u32)in); \ 802 res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1); \ 803 res0_m += res1_m; \ 804 sum_m = __msa_copy_u_w((v4i32)res0_m, 0); \ 805 sum_m; \ 806 }) 807 808 /* Description : Horizontal addition of 8 unsigned halfword elements 809 Arguments : 
Input - in (unsigned halfword vector) 810 Output - sum_m (u32 sum) 811 Return Type - unsigned word 812 Details : 8 unsigned halfword elements of 'in' vector are added 813 together and the resulting integer sum is returned 814 */ 815 #define HADD_UH_U32(in) \ 816 ({ \ 817 v4u32 res_m; \ 818 uint32_t sum_m; \ 819 \ 820 res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \ 821 sum_m = HADD_UW_U32(res_m); \ 822 sum_m; \ 823 }) 824 825 /* Description : Horizontal addition of unsigned byte vector elements 826 Arguments : Inputs - in0, in1 827 Outputs - out0, out1 828 Return Type - as per RTYPE 829 Details : Each unsigned odd byte element from 'in0' is added to 830 even unsigned byte element from 'in0' (pairwise) and the 831 halfword result is written to 'out0' 832 */ 833 #define HADD_UB2(RTYPE, in0, in1, out0, out1) \ 834 { \ 835 out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0); \ 836 out1 = (RTYPE)__msa_hadd_u_h((v16u8)in1, (v16u8)in1); \ 837 } 838 #define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__) 839 840 #define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \ 841 { \ 842 HADD_UB2(RTYPE, in0, in1, out0, out1); \ 843 HADD_UB2(RTYPE, in2, in3, out2, out3); \ 844 } 845 #define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__) 846 847 /* Description : Horizontal subtraction of unsigned byte vector elements 848 Arguments : Inputs - in0, in1 849 Outputs - out0, out1 850 Return Type - as per RTYPE 851 Details : Each unsigned odd byte element from 'in0' is subtracted from 852 even unsigned byte element from 'in0' (pairwise) and the 853 halfword result is written to 'out0' 854 */ 855 #define HSUB_UB2(RTYPE, in0, in1, out0, out1) \ 856 { \ 857 out0 = (RTYPE)__msa_hsub_u_h((v16u8)in0, (v16u8)in0); \ 858 out1 = (RTYPE)__msa_hsub_u_h((v16u8)in1, (v16u8)in1); \ 859 } 860 #define HSUB_UB2_SH(...) 
HSUB_UB2(v8i16, __VA_ARGS__) 861 862 /* Description : SAD (Sum of Absolute Difference) 863 Arguments : Inputs - in0, in1, ref0, ref1 864 Outputs - sad_m (halfword vector) 865 Return Type - unsigned halfword 866 Details : Absolute difference of all the byte elements from 'in0' with 867 'ref0' is calculated and preserved in 'diff0'. Then even-odd 868 pairs are added together to generate 8 halfword results. 869 */ 870 #define SAD_UB2_UH(in0, in1, ref0, ref1) \ 871 ({ \ 872 v16u8 diff0_m, diff1_m; \ 873 v8u16 sad_m = { 0 }; \ 874 \ 875 diff0_m = __msa_asub_u_b((v16u8)in0, (v16u8)ref0); \ 876 diff1_m = __msa_asub_u_b((v16u8)in1, (v16u8)ref1); \ 877 \ 878 sad_m += __msa_hadd_u_h((v16u8)diff0_m, (v16u8)diff0_m); \ 879 sad_m += __msa_hadd_u_h((v16u8)diff1_m, (v16u8)diff1_m); \ 880 \ 881 sad_m; \ 882 }) 883 884 /* Description : Horizontal subtraction of signed halfword vector elements 885 Arguments : Inputs - in0, in1 886 Outputs - out0, out1 887 Return Type - as per RTYPE 888 Details : Each signed odd halfword element from 'in0' is subtracted from 889 even signed halfword element from 'in0' (pairwise) and the 890 word result is written to 'out0' 891 */ 892 #define HSUB_UH2(RTYPE, in0, in1, out0, out1) \ 893 { \ 894 out0 = (RTYPE)__msa_hsub_s_w((v8i16)in0, (v8i16)in0); \ 895 out1 = (RTYPE)__msa_hsub_s_w((v8i16)in1, (v8i16)in1); \ 896 } 897 #define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__) 898 899 /* Description : Set element n input vector to GPR value 900 Arguments : Inputs - in0, in1, in2, in3 901 Output - out 902 Return Type - as per RTYPE 903 Details : Set element 0 in vector 'out' to value specified in 'in0' 904 */ 905 #define INSERT_W2(RTYPE, in0, in1, out) \ 906 { \ 907 out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \ 908 out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \ 909 } 910 #define INSERT_W2_SB(...) 
INSERT_W2(v16i8, __VA_ARGS__)

/* Insert GPR words 'in0'..'in3' into word elements 0..3 of vector 'out'. */
#define INSERT_W4(RTYPE, in0, in1, in2, in3, out)    \
  {                                                  \
    out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \
    out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \
    out = (RTYPE)__msa_insert_w((v4i32)out, 2, in2); \
    out = (RTYPE)__msa_insert_w((v4i32)out, 3, in3); \
  }
#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)

/* Insert GPR double words 'in0', 'in1' into elements 0 and 1 of 'out'. */
#define INSERT_D2(RTYPE, in0, in1, out)              \
  {                                                  \
    out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0); \
    out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1); \
  }
#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
#define INSERT_D2_SH(...) INSERT_D2(v8i16, __VA_ARGS__)

/* Description : Interleave even byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'
*/
#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
  {                                                     \
    out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \
    out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \
  }
#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)

/* Description : Interleave even halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'
*/
#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
  {                                                     \
    out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \
    out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2); \
  }
#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
#define ILVEV_H2_SW(...)
ILVEV_H2(v4i32, __VA_ARGS__)

/* Description : Interleave even word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even word elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'
*/
#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
  {                                                     \
    out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0); \
    out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2); \
  }
#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)

/* Description : Interleave even double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double word elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'
*/
#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
  {                                                     \
    out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0); \
    out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2); \
  }
#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)

/* Description : Interleave left half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of byte elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'.
*/
#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
  {                                                    \
    out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
    out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3); \
  }
#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
#define ILVL_B2_SH(...)
ILVL_B2(v8i16, __VA_ARGS__)

/* 4-pair variant of ILVL_B2. */
#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                out2, out3)                                                \
  {                                                                        \
    ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)
#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)

/* Description : Interleave left half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of halfword elements of 'in0' and 'in1' are
                 interleaved and written to 'out0'.
*/
#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
  {                                                    \
    out0 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \
    out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3); \
  }
#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)

/* Description : Interleave left half of word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of word elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'.
*/
#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
  {                                                    \
    out0 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \
    out1 = (RTYPE)__msa_ilvl_w((v4i32)in2, (v4i32)in3); \
  }
#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__)
#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)

/* Description : Interleave right half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of byte elements of 'in0' and 'in1' are interleaved
                 and written to out0.
*/
#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
  {                                                    \
    out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
    out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3); \
  }
#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)

/* 4-pair variant of ILVR_B2. */
#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                out2, out3)                                                \
  {                                                                        \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)

/* 8-pair variant of ILVR_B2. */
#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, \
                in11, in12, in13, in14, in15, out0, out1, out2, out3, out4,    \
                out5, out6, out7)                                              \
  {                                                                            \
    ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2,   \
            out3);                                                             \
    ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15, out4, out5,   \
            out6, out7);                                                       \
  }
#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)

/* Description : Interleave right half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of halfword elements of 'in0' and 'in1' are
                 interleaved and written to 'out0'.
*/
#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
  {                                                    \
    out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \
    out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \
  }
#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
#define ILVR_H2_SW(...)
ILVR_H2(v4i32, __VA_ARGS__)

/* 4-pair variant of ILVR_H2. */
#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                out2, out3)                                                \
  {                                                                        \
    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)

/* Interleave right half of word elements (see ILVR_H2 for the pattern). */
#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
  {                                                    \
    out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \
    out1 = (RTYPE)__msa_ilvr_w((v4i32)in2, (v4i32)in3); \
  }
#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)

/* 4-pair variant of ILVR_W2. */
#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                out2, out3)                                                \
  {                                                                        \
    ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__)

/* Description : Interleave right half of double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of double word elements of 'in0' and 'in1' are
                 interleaved and written to 'out0'.
*/
#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1)     \
  {                                                        \
    out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \
    out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \
  }
#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)

/* 3-pair variant: two pairs via ILVR_D2 plus one extra pair. */
#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
  {                                                                    \
    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);                    \
    out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5));            \
  }
#define ILVR_D3_SB(...)
ILVR_D3(v16i8, __VA_ARGS__)

/* 4-pair variant of ILVR_D2. */
#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                out2, out3)                                                \
  {                                                                        \
    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)

/* Description : Interleave both left and right half of input vectors
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of byte elements from 'in0' and 'in1' are
                 interleaved and written to 'out0'; left half to 'out1'.
*/
#define ILVRL_B2(RTYPE, in0, in1, out0, out1)          \
  {                                                    \
    out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
    out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
  }
#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)

/* Halfword variant of ILVRL_B2. */
#define ILVRL_H2(RTYPE, in0, in1, out0, out1)          \
  {                                                    \
    out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \
    out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \
  }
#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)

/* Word variant of ILVRL_B2. */
#define ILVRL_W2(RTYPE, in0, in1, out0, out1)          \
  {                                                    \
    out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \
    out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \
  }
#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
#define ILVRL_W2_SB(...) ILVRL_W2(v16i8, __VA_ARGS__)
#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
#define ILVRL_W2_SW(...)
ILVRL_W2(v4i32, __VA_ARGS__)

/* Description : Saturate the halfword element values to the max
                 unsigned value of (sat_val + 1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, sat_val
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each unsigned halfword element from 'in0' is saturated to the
                 value generated with (sat_val + 1) bit range.
                 The results are written in place
*/
#define SAT_UH2(RTYPE, in0, in1, sat_val)         \
  {                                               \
    in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val); \
    in1 = (RTYPE)__msa_sat_u_h((v8u16)in1, sat_val); \
  }
#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)

#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) \
  {                                                 \
    SAT_UH2(RTYPE, in0, in1, sat_val);              \
    SAT_UH2(RTYPE, in2, in3, sat_val)               \
  }
#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)

/* Description : Saturate the halfword element values to the max
                 signed value of (sat_val + 1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, sat_val
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each signed halfword element from 'in0' is saturated to the
                 value generated with (sat_val + 1) bit range
                 The results are written in place
*/
#define SAT_SH2(RTYPE, in0, in1, sat_val)         \
  {                                               \
    in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val); \
    in1 = (RTYPE)__msa_sat_s_h((v8i16)in1, sat_val); \
  }
#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)

#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) \
  {                                                 \
    SAT_SH2(RTYPE, in0, in1, sat_val);              \
    SAT_SH2(RTYPE, in2, in3, sat_val);              \
  }
#define SAT_SH4_SH(...)
SAT_SH4(v8i16, __VA_ARGS__)

/* Description : Indexed halfword element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, idx0, idx1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'idx0' element value from 'in' vector is replicated to all
                 elements in 'out0' vector
                 Valid index range for halfword operation is 0-7
*/
#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) \
  {                                                  \
    out0 = (RTYPE)__msa_splati_h((v8i16)in, idx0);   \
    out1 = (RTYPE)__msa_splati_h((v8i16)in, idx1);   \
  }
#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)

#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, out0, out1, out2, out3) \
  {                                                                          \
    SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1);                            \
    SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3);                            \
  }
#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)

/* Description : Pack even byte elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of 'in0' are copied to the left half of
                 'out0' & even byte elements of 'in1' are copied to the right
                 half of 'out0'.
*/
#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
  {                                                     \
    out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1); \
    out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3); \
  }
#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)

/* 4-pair variant of PCKEV_B2. */
#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                 out2, out3)                                                \
  {                                                                         \
    PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define PCKEV_B4_SB(...)
PCKEV_B4(v16i8, __VA_ARGS__)
#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)

/* Description : Pack even halfword elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of 'in0' are copied to the left half of
                 'out0' & even halfword elements of 'in1' are copied to the
                 right half of 'out0'.
*/
#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
  {                                                     \
    out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1); \
    out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3); \
  }
#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)

/* 4-pair variant of PCKEV_H2. */
#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                 out2, out3)                                                \
  {                                                                         \
    PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)

/* Description : Pack even double word elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double elements of 'in0' are copied to the left half of
                 'out0' & even double elements of 'in1' are copied to the right
                 half of 'out0'.
*/
#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
  {                                                     \
    out0 = (RTYPE)__msa_pckev_d((v2i64)in0, (v2i64)in1); \
    out1 = (RTYPE)__msa_pckev_d((v2i64)in2, (v2i64)in3); \
  }
#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
#define PCKEV_D2_SH(...)
PCKEV_D2(v8i16, __VA_ARGS__)

/* 4-pair variant of PCKEV_D2. */
#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                 out2, out3)                                                \
  {                                                                         \
    PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)

/* Description : Each byte element is logically xor'ed with immediate 128
   Arguments   : Inputs  - in0, in1
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each unsigned byte element from input vector 'in0' is
                 logically xor'ed with 128 and the result is stored in-place.
*/
#define XORI_B2_128(RTYPE, in0, in1)          \
  {                                           \
    in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128); \
    in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128); \
  }
#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)

/* 3-input variant: two via XORI_B2_128 plus one extra vector. */
#define XORI_B3_128(RTYPE, in0, in1, in2)     \
  {                                           \
    XORI_B2_128(RTYPE, in0, in1);             \
    in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128); \
  }
#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)

/* 4-input variant of XORI_B2_128. */
#define XORI_B4_128(RTYPE, in0, in1, in2, in3) \
  {                                            \
    XORI_B2_128(RTYPE, in0, in1);              \
    XORI_B2_128(RTYPE, in2, in3);              \
  }
#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)

/* 7-input variant, composed of the 4- and 3-input forms. */
#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) \
  {                                                           \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);                   \
    XORI_B3_128(RTYPE, in4, in5, in6);                        \
  }
#define XORI_B7_128_SB(...)
XORI_B7_128(v16i8, __VA_ARGS__)

/* Description : Average of signed halfword elements -> (a + b) / 2
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Each signed halfword element from 'in0' is added to each
                 signed halfword element of 'in1' with full precision resulting
                 in one extra bit in the result. The result is then divided by
                 2 and written to 'out0'
*/
#define AVE_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                out2, out3)                                                \
  {                                                                        \
    out0 = (RTYPE)__msa_ave_s_h((v8i16)in0, (v8i16)in1);                   \
    out1 = (RTYPE)__msa_ave_s_h((v8i16)in2, (v8i16)in3);                   \
    out2 = (RTYPE)__msa_ave_s_h((v8i16)in4, (v8i16)in5);                   \
    out3 = (RTYPE)__msa_ave_s_h((v8i16)in6, (v8i16)in7);                   \
  }
#define AVE_SH4_SH(...) AVE_SH4(v8i16, __VA_ARGS__)

/* Description : Addition of signed halfword elements and signed saturation
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'in0' are added to signed
                 halfword elements of 'in1'. The result is then signed
                 saturated between halfword data type range
*/
#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) \
  {                                                     \
    out0 = (RTYPE)__msa_adds_s_h((v8i16)in0, (v8i16)in1); \
    out1 = (RTYPE)__msa_adds_s_h((v8i16)in2, (v8i16)in3); \
  }
#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)

/* 4-pair variant of ADDS_SH2. */
#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                 out2, out3)                                                \
  {                                                                         \
    ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define ADDS_SH4_SH(...)
ADDS_SH4(v8i16, __VA_ARGS__)

/* Description : Shift left all elements of vector (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in place operation
                 Return Type - as per input vector RTYPE
   Details     : Each element of vector 'in0' is left shifted by 'shift' and
                 the result is written in-place.
*/
#define SLLI_4V(in0, in1, in2, in3, shift) \
  {                                        \
    in0 = in0 << shift;                    \
    in1 = in1 << shift;                    \
    in2 = in2 << shift;                    \
    in3 = in3 << shift;                    \
  }

/* Description : Arithmetic shift right all elements of vector
                 (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in place operation
                 Return Type - as per input vector RTYPE
   Details     : Each element of vector 'in0' is right shifted by 'shift' and
                 the result is written in-place. 'shift' is a GP variable.
*/
#define SRA_2V(in0, in1, shift) \
  {                             \
    in0 = in0 >> shift;         \
    in1 = in1 >> shift;         \
  }

#define SRA_4V(in0, in1, in2, in3, shift) \
  {                                       \
    in0 = in0 >> shift;                   \
    in1 = in1 >> shift;                   \
    in2 = in2 >> shift;                   \
    in3 = in3 >> shift;                   \
  }

/* Description : Shift right arithmetic rounded words
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetically
                 by the number of bits in the corresponding element in the
                 vector 'shift'. The last discarded bit is added to shifted
                 value for rounding and the result is written in-place.
                 'shift' is a vector.
*/
#define SRAR_W2(RTYPE, in0, in1, shift)                \
  {                                                    \
    in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)shift); \
    in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)shift); \
  }

#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) \
  {                                               \
    SRAR_W2(RTYPE, in0, in1, shift)               \
    SRAR_W2(RTYPE, in2, in3, shift)               \
  }
#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)

/* Description : Shift right arithmetic rounded (immediate)
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetically
                 by the value in 'shift'. The last discarded bit is added to
                 the shifted value for rounding and the result is written
                 in-place. 'shift' is an immediate value.
*/
#define SRARI_H2(RTYPE, in0, in1, shift)        \
  {                                             \
    in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift); \
    in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift); \
  }
#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)

#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) \
  {                                                \
    SRARI_H2(RTYPE, in0, in1, shift);              \
    SRARI_H2(RTYPE, in2, in3, shift);              \
  }
#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)

/* Word variant of SRARI_H2. */
#define SRARI_W2(RTYPE, in0, in1, shift)        \
  {                                             \
    in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift); \
    in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift); \
  }
#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)

#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) \
  {                                                \
    SRARI_W2(RTYPE, in0, in1, shift);              \
    SRARI_W2(RTYPE, in2, in3, shift);              \
  }
#define SRARI_W4_SW(...)
SRARI_W4(v4i32, __VA_ARGS__)

/* Description : Logical shift right all elements of vector (immediate)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is right shifted by 'shift' and
                 the result is written in-place. 'shift' is an immediate value.
*/
#define SRLI_H4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, shift) \
  {                                                                       \
    out0 = (RTYPE)__msa_srli_h((v8i16)in0, shift);                        \
    out1 = (RTYPE)__msa_srli_h((v8i16)in1, shift);                        \
    out2 = (RTYPE)__msa_srli_h((v8i16)in2, shift);                        \
    out3 = (RTYPE)__msa_srli_h((v8i16)in3, shift);                        \
  }
#define SRLI_H4_SH(...) SRLI_H4(v8i16, __VA_ARGS__)

/* Description : Multiplication of pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element from 'in0' is multiplied with elements from 'in1'
                 and the result is written to 'out0'
*/
#define MUL2(in0, in1, in2, in3, out0, out1) \
  {                                          \
    out0 = in0 * in1;                        \
    out1 = in2 * in3;                        \
  }
#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
  {                                                                          \
    MUL2(in0, in1, in2, in3, out0, out1);                                    \
    MUL2(in4, in5, in6, in7, out2, out3);                                    \
  }

/* Description : Addition of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element in 'in0' is added to 'in1' and result is written
                 to 'out0'.
*/
#define ADD2(in0, in1, in2, in3, out0, out1) \
  {                                          \
    out0 = in0 + in1;                        \
    out1 = in2 + in3;                        \
  }
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
  {                                                                          \
    ADD2(in0, in1, in2, in3, out0, out1);                                    \
    ADD2(in4, in5, in6, in7, out2, out3);                                    \
  }

/* Description : Subtraction of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element in 'in1' is subtracted from 'in0' and result is
                 written to 'out0'.
*/
#define SUB2(in0, in1, in2, in3, out0, out1) \
  {                                          \
    out0 = in0 - in1;                        \
    out1 = in2 - in3;                        \
  }
/* Written out longhand (equivalent to two SUB2 invocations). */
#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
  {                                                                          \
    out0 = in0 - in1;                                                        \
    out1 = in2 - in3;                                                        \
    out2 = in4 - in5;                                                        \
    out3 = in6 - in7;                                                        \
  }

/* Description : Sign extend halfword elements from right half of the vector
   Arguments   : Input  - in (halfword vector)
                 Output - out (sign extended word vector)
                 Return Type - signed word
   Details     : Sign bit of halfword elements from input vector 'in' is
                 extracted and interleaved with same vector 'in0' to generate
                 4 word elements keeping sign intact
*/
#define UNPCK_R_SH_SW(in, out)                       \
  {                                                  \
    v8i16 sign_m;                                    \
                                                     \
    sign_m = __msa_clti_s_h((v8i16)in, 0);           \
    out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in);    \
  }

/* Description : Sign extend byte elements from input vector and return
                 halfword results in pair of vectors
   Arguments   : Input   - in (byte vector)
                 Outputs - out0, out1 (sign extended halfword vectors)
                 Return Type - signed halfword
   Details     : Sign bit of byte elements from input vector 'in' is
                 extracted and interleaved right with same vector 'in0' to
                 generate 8 signed halfword elements in 'out0'
                 Then interleaved left with same vector 'in0' to
                 generate 8 signed halfword elements in 'out1'
*/
#define UNPCK_SB_SH(in, out0, out1)       \
  {                                       \
    v16i8 tmp_m;                          \
                                          \
    tmp_m = __msa_clti_s_b((v16i8)in, 0); \
    ILVRL_B2_SH(tmp_m, in, out0, out1);   \
  }

/* Description : Zero extend unsigned byte elements to halfword elements
   Arguments   : Input   - in (unsigned byte vector)
                 Outputs - out0, out1 (unsigned halfword vectors)
                 Return Type - signed halfword
   Details     : Zero extended right half of vector is returned in 'out0'
                 Zero extended left half of vector is returned in 'out1'
*/
#define UNPCK_UB_SH(in, out0, out1)      \
  {                                      \
    v16i8 zero_m = { 0 };                \
                                         \
    ILVRL_B2_SH(zero_m, in, out0, out1); \
  }

/* Description : Sign extend halfword elements from input vector and return
                 the result in pair of vectors
   Arguments   : Input   - in (halfword vector)
                 Outputs - out0, out1 (sign extended word vectors)
                 Return Type - signed word
   Details     : Sign bit of halfword elements from input vector 'in' is
                 extracted and interleaved right with same vector 'in0' to
                 generate 4 signed word elements in 'out0'
                 Then interleaved left with same vector 'in0' to
                 generate 4 signed word elements in 'out1'
*/
#define UNPCK_SH_SW(in, out0, out1)       \
  {                                       \
    v8i16 tmp_m;                          \
                                          \
    tmp_m = __msa_clti_s_h((v8i16)in, 0); \
    ILVRL_H2_SW(tmp_m, in, out0, out1);   \
  }

/* Description : Butterfly of 4 input vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
   Details     : Butterfly operation
*/
#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) \
  {                                                             \
    out0 = in0 + in3;                                           \
    out1 = in1 + in2;                                           \
                                                                \
    out2 = in1 - in2;                                           \
    out3 = in0 - in3;                                           \
  }

/* Description : Butterfly of 8 input vectors
   Arguments   : Inputs  - in0 ... in7
                 Outputs - out0 ..
out7
   Details     : Butterfly operation
*/
#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
                    out3, out4, out5, out6, out7)                             \
  {                                                                           \
    out0 = in0 + in7;                                                         \
    out1 = in1 + in6;                                                         \
    out2 = in2 + in5;                                                         \
    out3 = in3 + in4;                                                         \
                                                                              \
    out4 = in3 - in4;                                                         \
    out5 = in2 - in5;                                                         \
    out6 = in1 - in6;                                                         \
    out7 = in0 - in7;                                                         \
  }

/* Description : Butterfly of 16 input vectors
   Arguments   : Inputs  - in0 ... in15
                 Outputs - out0 .. out15
   Details     : Butterfly operation
*/
#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, \
                     in11, in12, in13, in14, in15, out0, out1, out2, out3,   \
                     out4, out5, out6, out7, out8, out9, out10, out11,       \
                     out12, out13, out14, out15)                             \
  {                                                                          \
    out0 = in0 + in15;                                                       \
    out1 = in1 + in14;                                                       \
    out2 = in2 + in13;                                                       \
    out3 = in3 + in12;                                                       \
    out4 = in4 + in11;                                                       \
    out5 = in5 + in10;                                                       \
    out6 = in6 + in9;                                                        \
    out7 = in7 + in8;                                                        \
                                                                             \
    out8 = in7 - in8;                                                        \
    out9 = in6 - in9;                                                        \
    out10 = in5 - in10;                                                      \
    out11 = in4 - in11;                                                      \
    out12 = in3 - in12;                                                      \
    out13 = in2 - in13;                                                      \
    out14 = in1 - in14;                                                      \
    out15 = in0 - in15;                                                      \
  }

/* Description : Transpose input 8x8 byte block
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - as per RTYPE
*/
#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0,  \
                        out1, out2, out3, out4, out5, out6, out7)             \
  {                                                                           \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                     \
    v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                     \
                                                                              \
    ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, tmp0_m, tmp1_m,        \
               tmp2_m, tmp3_m);                                               \
    ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m);                              \
    ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m);                              \
    ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2);                              \
    ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6);                              \
    SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8);                              \
    SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8);                              \
  }
#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)

/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                           in8, in9, in10, in11, in12, in13, in14, in15
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - unsigned byte
*/
#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, \
                            in10, in11, in12, in13, in14, in15, out0, out1,   \
                            out2, out3, out4, out5, out6, out7)               \
  {                                                                           \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                     \
    v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                     \
                                                                              \
    ILVEV_D2_UB(in0, in8, in1, in9, out7, out6);                              \
    ILVEV_D2_UB(in2, in10, in3, in11, out5, out4);                            \
    ILVEV_D2_UB(in4, in12, in5, in13, out3, out2);                            \
    ILVEV_D2_UB(in6, in14, in7, in15, out1, out0);                            \
                                                                              \
    tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7);                  \
    tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7);                  \
    tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5);                  \
    tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5);                  \
    out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3);                    \
    tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3);                  \
    out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1);                    \
    tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1);                  \
                                                                              \
    ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m);                  \
    out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
    out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
                                                                              \
    tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m);              \
    tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5);                  \
    out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
    out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
                                                                              \
1774 ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \ 1775 out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ 1776 out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ 1777 \ 1778 tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \ 1779 tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \ 1780 out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ 1781 out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ 1782 } 1783 1784 /* Description : Transpose 4x4 block with half word elements in vectors 1785 Arguments : Inputs - in0, in1, in2, in3 1786 Outputs - out0, out1, out2, out3 1787 Return Type - signed halfword 1788 */ 1789 #define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \ 1790 { \ 1791 v8i16 s0_m, s1_m; \ 1792 \ 1793 ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m); \ 1794 ILVRL_W2_SH(s1_m, s0_m, out0, out2); \ 1795 out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ 1796 out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2); \ 1797 } 1798 1799 /* Description : Transpose 4x8 block with half word elements in vectors 1800 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 1801 Outputs - out0, out1, out2, out3, out4, out5, out6, out7 1802 Return Type - signed halfword 1803 */ 1804 #define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ 1805 out2, out3, out4, out5, out6, out7) \ 1806 { \ 1807 v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ 1808 v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n; \ 1809 v8i16 zero_m = { 0 }; \ 1810 \ 1811 ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, tmp0_n, tmp1_n, tmp2_n, \ 1812 tmp3_n); \ 1813 ILVRL_W2_SH(tmp1_n, tmp0_n, tmp0_m, tmp2_m); \ 1814 ILVRL_W2_SH(tmp3_n, tmp2_n, tmp1_m, tmp3_m); \ 1815 \ 1816 out0 = (v8i16)__msa_ilvr_d((v2i64)tmp1_m, (v2i64)tmp0_m); \ 1817 out1 = (v8i16)__msa_ilvl_d((v2i64)tmp1_m, (v2i64)tmp0_m); \ 1818 out2 = (v8i16)__msa_ilvr_d((v2i64)tmp3_m, (v2i64)tmp2_m); \ 1819 out3 = (v8i16)__msa_ilvl_d((v2i64)tmp3_m, 
(v2i64)tmp2_m); \ 1820 \ 1821 out4 = zero_m; \ 1822 out5 = zero_m; \ 1823 out6 = zero_m; \ 1824 out7 = zero_m; \ 1825 } 1826 1827 /* Description : Transpose 8x4 block with half word elements in vectors 1828 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 1829 Outputs - out0, out1, out2, out3, out4, out5, out6, out7 1830 Return Type - signed halfword 1831 */ 1832 #define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \ 1833 { \ 1834 v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ 1835 \ 1836 ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m); \ 1837 ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m); \ 1838 ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2); \ 1839 ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3); \ 1840 } 1841 1842 /* Description : Transpose 8x8 block with half word elements in vectors 1843 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 1844 Outputs - out0, out1, out2, out3, out4, out5, out6, out7 1845 Return Type - as per RTYPE 1846 */ 1847 #define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, \ 1848 out1, out2, out3, out4, out5, out6, out7) \ 1849 { \ 1850 v8i16 s0_m, s1_m; \ 1851 v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ 1852 v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ 1853 \ 1854 ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \ 1855 ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m); \ 1856 ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \ 1857 ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m); \ 1858 ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \ 1859 ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m); \ 1860 ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \ 1861 ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m); \ 1862 PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, tmp3_m, \ 1863 tmp7_m, out0, out2, out4, out6); \ 1864 out1 = (RTYPE)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m); \ 1865 out3 = (RTYPE)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m); \ 1866 out5 = (RTYPE)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m); \ 1867 out7 = 
(RTYPE)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m); \ 1868 } 1869 #define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__) 1870 1871 /* Description : Transpose 4x4 block with word elements in vectors 1872 Arguments : Inputs - in0, in1, in2, in3 1873 Outputs - out0, out1, out2, out3 1874 Return Type - signed word 1875 */ 1876 #define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) \ 1877 { \ 1878 v4i32 s0_m, s1_m, s2_m, s3_m; \ 1879 \ 1880 ILVRL_W2_SW(in1, in0, s0_m, s1_m); \ 1881 ILVRL_W2_SW(in3, in2, s2_m, s3_m); \ 1882 \ 1883 out0 = (v4i32)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m); \ 1884 out1 = (v4i32)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m); \ 1885 out2 = (v4i32)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m); \ 1886 out3 = (v4i32)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m); \ 1887 } 1888 1889 /* Description : Add block 4x4 1890 Arguments : Inputs - in0, in1, in2, in3, pdst, stride 1891 Details : Least significant 4 bytes from each input vector are added to 1892 the destination bytes, clipped between 0-255 and stored. 
1893 */ 1894 #define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) \ 1895 { \ 1896 uint32_t src0_m, src1_m, src2_m, src3_m; \ 1897 v8i16 inp0_m, inp1_m, res0_m, res1_m; \ 1898 v16i8 dst0_m = { 0 }; \ 1899 v16i8 dst1_m = { 0 }; \ 1900 v16i8 zero_m = { 0 }; \ 1901 \ 1902 ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m) \ 1903 LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m); \ 1904 INSERT_W2_SB(src0_m, src1_m, dst0_m); \ 1905 INSERT_W2_SB(src2_m, src3_m, dst1_m); \ 1906 ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m); \ 1907 ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m); \ 1908 CLIP_SH2_0_255(res0_m, res1_m); \ 1909 PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \ 1910 ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride); \ 1911 } 1912 1913 /* Description : Pack even elements of input vectors & xor with 128 1914 Arguments : Inputs - in0, in1 1915 Output - out_m 1916 Return Type - unsigned byte 1917 Details : Signed byte even elements from 'in0' and 'in1' are packed 1918 together in one vector and the resulting vector is xor'ed with 1919 128 to shift the range from signed to unsigned byte 1920 */ 1921 #define PCKEV_XORI128_UB(in0, in1) \ 1922 ({ \ 1923 v16u8 out_m; \ 1924 \ 1925 out_m = (v16u8)__msa_pckev_b((v16i8)in1, (v16i8)in0); \ 1926 out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128); \ 1927 out_m; \ 1928 }) 1929 1930 /* Description : Converts inputs to unsigned bytes, interleave, average & store 1931 as 8x4 unsigned byte block 1932 Arguments : Inputs - in0, in1, in2, in3, dst0, dst1, pdst, stride 1933 */ 1934 #define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, pdst, stride) \ 1935 { \ 1936 v16u8 tmp0_m, tmp1_m; \ 1937 uint8_t *pdst_m = (uint8_t *)(pdst); \ 1938 \ 1939 tmp0_m = PCKEV_XORI128_UB(in0, in1); \ 1940 tmp1_m = PCKEV_XORI128_UB(in2, in3); \ 1941 AVER_UB2_UB(tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m); \ 1942 ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \ 1943 } 1944 1945 /* Description : Pack even byte elements and 
store byte vector in destination 1946 memory 1947 Arguments : Inputs - in0, in1, pdst 1948 */ 1949 #define PCKEV_ST_SB(in0, in1, pdst) \ 1950 { \ 1951 v16i8 tmp_m; \ 1952 \ 1953 tmp_m = __msa_pckev_b((v16i8)in1, (v16i8)in0); \ 1954 ST_SB(tmp_m, (pdst)); \ 1955 } 1956 1957 /* Description : Horizontal 2 tap filter kernel code 1958 Arguments : Inputs - in0, in1, mask, coeff, shift 1959 */ 1960 #define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) \ 1961 ({ \ 1962 v16i8 tmp0_m; \ 1963 v8u16 tmp1_m; \ 1964 \ 1965 tmp0_m = __msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0); \ 1966 tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)coeff); \ 1967 tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift); \ 1968 \ 1969 tmp1_m; \ 1970 }) 1971 #endif // VPX_VPX_DSP_MIPS_MACROS_MSA_H_ 1972