1 /* 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #ifndef VPX_VPX_DSP_MIPS_MACROS_MSA_H_ 12 #define VPX_VPX_DSP_MIPS_MACROS_MSA_H_ 13 14 #include <msa.h> 15 16 #include "./vpx_config.h" 17 #include "vpx/vpx_integer.h" 18 19 #define LD_V(RTYPE, psrc) *((const RTYPE *)(psrc)) 20 #define LD_UB(...) LD_V(v16u8, __VA_ARGS__) 21 #define LD_SB(...) LD_V(v16i8, __VA_ARGS__) 22 #define LD_UH(...) LD_V(v8u16, __VA_ARGS__) 23 #define LD_SH(...) LD_V(v8i16, __VA_ARGS__) 24 #define LD_SW(...) LD_V(v4i32, __VA_ARGS__) 25 26 #define ST_V(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) 27 #define ST_UB(...) ST_V(v16u8, __VA_ARGS__) 28 #define ST_SB(...) ST_V(v16i8, __VA_ARGS__) 29 #define ST_SH(...) ST_V(v8i16, __VA_ARGS__) 30 #define ST_SW(...) 
ST_V(v4i32, __VA_ARGS__) 31 32 #if (__mips_isa_rev >= 6) 33 #define LH(psrc) \ 34 ({ \ 35 uint16_t val_lh_m = *(const uint16_t *)(psrc); \ 36 val_lh_m; \ 37 }) 38 39 #define LW(psrc) \ 40 ({ \ 41 uint32_t val_lw_m = *(const uint32_t *)(psrc); \ 42 val_lw_m; \ 43 }) 44 45 #if (__mips == 64) 46 #define LD(psrc) \ 47 ({ \ 48 uint64_t val_ld_m = *(const uint64_t *)(psrc); \ 49 val_ld_m; \ 50 }) 51 #else // !(__mips == 64) 52 #define LD(psrc) \ 53 ({ \ 54 const uint8_t *psrc_ld_m = (const uint8_t *)(psrc); \ 55 uint32_t val0_ld_m, val1_ld_m; \ 56 uint64_t val_ld_m = 0; \ 57 \ 58 val0_ld_m = LW(psrc_ld_m); \ 59 val1_ld_m = LW(psrc_ld_m + 4); \ 60 \ 61 val_ld_m = (uint64_t)(val1_ld_m); \ 62 val_ld_m = (uint64_t)((val_ld_m << 32) & 0xFFFFFFFF00000000); \ 63 val_ld_m = (uint64_t)(val_ld_m | (uint64_t)val0_ld_m); \ 64 \ 65 val_ld_m; \ 66 }) 67 #endif // (__mips == 64) 68 69 #define SH(val, pdst) *(uint16_t *)(pdst) = (val); 70 #define SW(val, pdst) *(uint32_t *)(pdst) = (val); 71 #define SD(val, pdst) *(uint64_t *)(pdst) = (val); 72 #else // !(__mips_isa_rev >= 6) 73 #define LH(psrc) \ 74 ({ \ 75 const uint8_t *psrc_lh_m = (const uint8_t *)(psrc); \ 76 uint16_t val_lh_m; \ 77 \ 78 __asm__ __volatile__("ulh %[val_lh_m], %[psrc_lh_m] \n\t" \ 79 \ 80 : [val_lh_m] "=r"(val_lh_m) \ 81 : [psrc_lh_m] "m"(*psrc_lh_m)); \ 82 \ 83 val_lh_m; \ 84 }) 85 86 #define LW(psrc) \ 87 ({ \ 88 const uint8_t *psrc_lw_m = (const uint8_t *)(psrc); \ 89 uint32_t val_lw_m; \ 90 \ 91 __asm__ __volatile__("ulw %[val_lw_m], %[psrc_lw_m] \n\t" \ 92 \ 93 : [val_lw_m] "=r"(val_lw_m) \ 94 : [psrc_lw_m] "m"(*psrc_lw_m)); \ 95 \ 96 val_lw_m; \ 97 }) 98 99 #if (__mips == 64) 100 #define LD(psrc) \ 101 ({ \ 102 const uint8_t *psrc_ld_m = (const uint8_t *)(psrc); \ 103 uint64_t val_ld_m = 0; \ 104 \ 105 __asm__ __volatile__("uld %[val_ld_m], %[psrc_ld_m] \n\t" \ 106 \ 107 : [val_ld_m] "=r"(val_ld_m) \ 108 : [psrc_ld_m] "m"(*psrc_ld_m)); \ 109 \ 110 val_ld_m; \ 111 }) 112 #else // !(__mips == 64) 113 #define 
LD(psrc) \ 114 ({ \ 115 const uint8_t *psrc_ld_m = (const uint8_t *)(psrc); \ 116 uint32_t val0_ld_m, val1_ld_m; \ 117 uint64_t val_ld_m = 0; \ 118 \ 119 val0_ld_m = LW(psrc_ld_m); \ 120 val1_ld_m = LW(psrc_ld_m + 4); \ 121 \ 122 val_ld_m = (uint64_t)(val1_ld_m); \ 123 val_ld_m = (uint64_t)((val_ld_m << 32) & 0xFFFFFFFF00000000); \ 124 val_ld_m = (uint64_t)(val_ld_m | (uint64_t)val0_ld_m); \ 125 \ 126 val_ld_m; \ 127 }) 128 #endif // (__mips == 64) 129 130 #define SH(val, pdst) \ 131 { \ 132 uint8_t *pdst_sh_m = (uint8_t *)(pdst); \ 133 const uint16_t val_sh_m = (val); \ 134 \ 135 __asm__ __volatile__("ush %[val_sh_m], %[pdst_sh_m] \n\t" \ 136 \ 137 : [pdst_sh_m] "=m"(*pdst_sh_m) \ 138 : [val_sh_m] "r"(val_sh_m)); \ 139 } 140 141 #define SW(val, pdst) \ 142 { \ 143 uint8_t *pdst_sw_m = (uint8_t *)(pdst); \ 144 const uint32_t val_sw_m = (val); \ 145 \ 146 __asm__ __volatile__("usw %[val_sw_m], %[pdst_sw_m] \n\t" \ 147 \ 148 : [pdst_sw_m] "=m"(*pdst_sw_m) \ 149 : [val_sw_m] "r"(val_sw_m)); \ 150 } 151 152 #define SD(val, pdst) \ 153 { \ 154 uint8_t *pdst_sd_m = (uint8_t *)(pdst); \ 155 uint32_t val0_sd_m, val1_sd_m; \ 156 \ 157 val0_sd_m = (uint32_t)((val)&0x00000000FFFFFFFF); \ 158 val1_sd_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ 159 \ 160 SW(val0_sd_m, pdst_sd_m); \ 161 SW(val1_sd_m, pdst_sd_m + 4); \ 162 } 163 #endif // (__mips_isa_rev >= 6) 164 165 /* Description : Load 4 words with stride 166 Arguments : Inputs - psrc, stride 167 Outputs - out0, out1, out2, out3 168 Details : Load word in 'out0' from (psrc) 169 Load word in 'out1' from (psrc + stride) 170 Load word in 'out2' from (psrc + 2 * stride) 171 Load word in 'out3' from (psrc + 3 * stride) 172 */ 173 #define LW4(psrc, stride, out0, out1, out2, out3) \ 174 { \ 175 out0 = LW((psrc)); \ 176 out1 = LW((psrc) + stride); \ 177 out2 = LW((psrc) + 2 * stride); \ 178 out3 = LW((psrc) + 3 * stride); \ 179 } 180 181 /* Description : Load double words with stride 182 Arguments : Inputs - psrc, stride 
183 Outputs - out0, out1 184 Details : Load double word in 'out0' from (psrc) 185 Load double word in 'out1' from (psrc + stride) 186 */ 187 #define LD2(psrc, stride, out0, out1) \ 188 { \ 189 out0 = LD((psrc)); \ 190 out1 = LD((psrc) + stride); \ 191 } 192 #define LD4(psrc, stride, out0, out1, out2, out3) \ 193 { \ 194 LD2((psrc), stride, out0, out1); \ 195 LD2((psrc) + 2 * stride, stride, out2, out3); \ 196 } 197 198 /* Description : Store 4 words with stride 199 Arguments : Inputs - in0, in1, in2, in3, pdst, stride 200 Details : Store word from 'in0' to (pdst) 201 Store word from 'in1' to (pdst + stride) 202 Store word from 'in2' to (pdst + 2 * stride) 203 Store word from 'in3' to (pdst + 3 * stride) 204 */ 205 #define SW4(in0, in1, in2, in3, pdst, stride) \ 206 { \ 207 SW(in0, (pdst)) \ 208 SW(in1, (pdst) + stride); \ 209 SW(in2, (pdst) + 2 * stride); \ 210 SW(in3, (pdst) + 3 * stride); \ 211 } 212 213 /* Description : Store 4 double words with stride 214 Arguments : Inputs - in0, in1, in2, in3, pdst, stride 215 Details : Store double word from 'in0' to (pdst) 216 Store double word from 'in1' to (pdst + stride) 217 Store double word from 'in2' to (pdst + 2 * stride) 218 Store double word from 'in3' to (pdst + 3 * stride) 219 */ 220 #define SD4(in0, in1, in2, in3, pdst, stride) \ 221 { \ 222 SD(in0, (pdst)) \ 223 SD(in1, (pdst) + stride); \ 224 SD(in2, (pdst) + 2 * stride); \ 225 SD(in3, (pdst) + 3 * stride); \ 226 } 227 228 /* Description : Load vector elements with stride 229 Arguments : Inputs - psrc, stride 230 Outputs - out0, out1 231 Return Type - as per RTYPE 232 Details : Load 16 byte elements in 'out0' from (psrc) 233 Load 16 byte elements in 'out1' from (psrc + stride) 234 */ 235 #define LD_V2(RTYPE, psrc, stride, out0, out1) \ 236 { \ 237 out0 = LD_V(RTYPE, (psrc)); \ 238 out1 = LD_V(RTYPE, (psrc) + stride); \ 239 } 240 #define LD_UB2(...) LD_V2(v16u8, __VA_ARGS__) 241 #define LD_SB2(...) LD_V2(v16i8, __VA_ARGS__) 242 #define LD_SH2(...) 
LD_V2(v8i16, __VA_ARGS__) 243 #define LD_SW2(...) LD_V2(v4i32, __VA_ARGS__) 244 245 #define LD_V3(RTYPE, psrc, stride, out0, out1, out2) \ 246 { \ 247 LD_V2(RTYPE, (psrc), stride, out0, out1); \ 248 out2 = LD_V(RTYPE, (psrc) + 2 * stride); \ 249 } 250 #define LD_UB3(...) LD_V3(v16u8, __VA_ARGS__) 251 252 #define LD_V4(RTYPE, psrc, stride, out0, out1, out2, out3) \ 253 { \ 254 LD_V2(RTYPE, (psrc), stride, out0, out1); \ 255 LD_V2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ 256 } 257 #define LD_UB4(...) LD_V4(v16u8, __VA_ARGS__) 258 #define LD_SB4(...) LD_V4(v16i8, __VA_ARGS__) 259 #define LD_SH4(...) LD_V4(v8i16, __VA_ARGS__) 260 261 #define LD_V5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \ 262 { \ 263 LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ 264 out4 = LD_V(RTYPE, (psrc) + 4 * stride); \ 265 } 266 #define LD_UB5(...) LD_V5(v16u8, __VA_ARGS__) 267 #define LD_SB5(...) LD_V5(v16i8, __VA_ARGS__) 268 269 #define LD_V7(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6) \ 270 { \ 271 LD_V5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \ 272 LD_V2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \ 273 } 274 #define LD_SB7(...) LD_V7(v16i8, __VA_ARGS__) 275 276 #define LD_V8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ 277 out7) \ 278 { \ 279 LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ 280 LD_V4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ 281 } 282 #define LD_UB8(...) LD_V8(v16u8, __VA_ARGS__) 283 #define LD_SB8(...) LD_V8(v16i8, __VA_ARGS__) 284 #define LD_SH8(...) 
LD_V8(v8i16, __VA_ARGS__) 285 286 #define LD_V16(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ 287 out7, out8, out9, out10, out11, out12, out13, out14, out15) \ 288 { \ 289 LD_V8(RTYPE, (psrc), stride, out0, out1, out2, out3, out4, out5, out6, \ 290 out7); \ 291 LD_V8(RTYPE, (psrc) + 8 * stride, stride, out8, out9, out10, out11, out12, \ 292 out13, out14, out15); \ 293 } 294 #define LD_SH16(...) LD_V16(v8i16, __VA_ARGS__) 295 296 /* Description : Load 4x4 block of signed halfword elements from 1D source 297 data into 4 vectors (Each vector with 4 signed halfwords) 298 Arguments : Input - psrc 299 Outputs - out0, out1, out2, out3 300 */ 301 #define LD4x4_SH(psrc, out0, out1, out2, out3) \ 302 { \ 303 out0 = LD_SH(psrc); \ 304 out2 = LD_SH(psrc + 8); \ 305 out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ 306 out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \ 307 } 308 309 /* Description : Store vectors with stride 310 Arguments : Inputs - in0, in1, pdst, stride 311 Details : Store 16 byte elements from 'in0' to (pdst) 312 Store 16 byte elements from 'in1' to (pdst + stride) 313 */ 314 #define ST_V2(RTYPE, in0, in1, pdst, stride) \ 315 { \ 316 ST_V(RTYPE, in0, (pdst)); \ 317 ST_V(RTYPE, in1, (pdst) + stride); \ 318 } 319 #define ST_UB2(...) ST_V2(v16u8, __VA_ARGS__) 320 #define ST_SH2(...) ST_V2(v8i16, __VA_ARGS__) 321 #define ST_SW2(...) ST_V2(v4i32, __VA_ARGS__) 322 323 #define ST_V4(RTYPE, in0, in1, in2, in3, pdst, stride) \ 324 { \ 325 ST_V2(RTYPE, in0, in1, (pdst), stride); \ 326 ST_V2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ 327 } 328 #define ST_UB4(...) ST_V4(v16u8, __VA_ARGS__) 329 #define ST_SH4(...) ST_V4(v8i16, __VA_ARGS__) 330 331 #define ST_V8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ 332 { \ 333 ST_V4(RTYPE, in0, in1, in2, in3, pdst, stride); \ 334 ST_V4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ 335 } 336 #define ST_UB8(...) ST_V8(v16u8, __VA_ARGS__) 337 #define ST_SH8(...) 
ST_V8(v8i16, __VA_ARGS__) 338 339 /* Description : Store 2x4 byte block to destination memory from input vector 340 Arguments : Inputs - in, stidx, pdst, stride 341 Details : Index 'stidx' halfword element from 'in' vector is copied to 342 the GP register and stored to (pdst) 343 Index 'stidx+1' halfword element from 'in' vector is copied to 344 the GP register and stored to (pdst + stride) 345 Index 'stidx+2' halfword element from 'in' vector is copied to 346 the GP register and stored to (pdst + 2 * stride) 347 Index 'stidx+3' halfword element from 'in' vector is copied to 348 the GP register and stored to (pdst + 3 * stride) 349 */ 350 #define ST2x4_UB(in, stidx, pdst, stride) \ 351 { \ 352 uint16_t out0_m, out1_m, out2_m, out3_m; \ 353 uint8_t *pblk_2x4_m = (uint8_t *)(pdst); \ 354 \ 355 out0_m = __msa_copy_u_h((v8i16)in, (stidx)); \ 356 out1_m = __msa_copy_u_h((v8i16)in, (stidx + 1)); \ 357 out2_m = __msa_copy_u_h((v8i16)in, (stidx + 2)); \ 358 out3_m = __msa_copy_u_h((v8i16)in, (stidx + 3)); \ 359 \ 360 SH(out0_m, pblk_2x4_m); \ 361 SH(out1_m, pblk_2x4_m + stride); \ 362 SH(out2_m, pblk_2x4_m + 2 * stride); \ 363 SH(out3_m, pblk_2x4_m + 3 * stride); \ 364 } 365 366 /* Description : Store 4x2 byte block to destination memory from input vector 367 Arguments : Inputs - in, pdst, stride 368 Details : Index 0 word element from 'in' vector is copied to the GP 369 register and stored to (pdst) 370 Index 1 word element from 'in' vector is copied to the GP 371 register and stored to (pdst + stride) 372 */ 373 #define ST4x2_UB(in, pdst, stride) \ 374 { \ 375 uint32_t out0_m, out1_m; \ 376 uint8_t *pblk_4x2_m = (uint8_t *)(pdst); \ 377 \ 378 out0_m = __msa_copy_u_w((v4i32)in, 0); \ 379 out1_m = __msa_copy_u_w((v4i32)in, 1); \ 380 \ 381 SW(out0_m, pblk_4x2_m); \ 382 SW(out1_m, pblk_4x2_m + stride); \ 383 } 384 385 /* Description : Store 4x4 byte block to destination memory from input vector 386 Arguments : Inputs - in0, in1, pdst, stride 387 Details : 'Idx0' word element 
from input vector 'in0' is copied to the 388 GP register and stored to (pdst) 389 'Idx1' word element from input vector 'in0' is copied to the 390 GP register and stored to (pdst + stride) 391 'Idx2' word element from input vector 'in0' is copied to the 392 GP register and stored to (pdst + 2 * stride) 393 'Idx3' word element from input vector 'in0' is copied to the 394 GP register and stored to (pdst + 3 * stride) 395 */ 396 #define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \ 397 { \ 398 uint32_t out0_m, out1_m, out2_m, out3_m; \ 399 uint8_t *pblk_4x4_m = (uint8_t *)(pdst); \ 400 \ 401 out0_m = __msa_copy_u_w((v4i32)in0, idx0); \ 402 out1_m = __msa_copy_u_w((v4i32)in0, idx1); \ 403 out2_m = __msa_copy_u_w((v4i32)in1, idx2); \ 404 out3_m = __msa_copy_u_w((v4i32)in1, idx3); \ 405 \ 406 SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride); \ 407 } 408 #define ST4x8_UB(in0, in1, pdst, stride) \ 409 { \ 410 uint8_t *pblk_4x8 = (uint8_t *)(pdst); \ 411 \ 412 ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride); \ 413 ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride); \ 414 } 415 416 /* Description : Store 8x1 byte block to destination memory from input vector 417 Arguments : Inputs - in, pdst 418 Details : Index 0 double word element from 'in' vector is copied to the 419 GP register and stored to (pdst) 420 */ 421 #define ST8x1_UB(in, pdst) \ 422 { \ 423 uint64_t out0_m; \ 424 \ 425 out0_m = __msa_copy_u_d((v2i64)in, 0); \ 426 SD(out0_m, pdst); \ 427 } 428 429 /* Description : Store 8x2 byte block to destination memory from input vector 430 Arguments : Inputs - in, pdst, stride 431 Details : Index 0 double word element from 'in' vector is copied to the 432 GP register and stored to (pdst) 433 Index 1 double word element from 'in' vector is copied to the 434 GP register and stored to (pdst + stride) 435 */ 436 #define ST8x2_UB(in, pdst, stride) \ 437 { \ 438 uint64_t out0_m, out1_m; \ 439 uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \ 440 \ 441 
out0_m = __msa_copy_u_d((v2i64)in, 0); \ 442 out1_m = __msa_copy_u_d((v2i64)in, 1); \ 443 \ 444 SD(out0_m, pblk_8x2_m); \ 445 SD(out1_m, pblk_8x2_m + stride); \ 446 } 447 448 /* Description : Store 8x4 byte block to destination memory from input 449 vectors 450 Arguments : Inputs - in0, in1, pdst, stride 451 Details : Index 0 double word element from 'in0' vector is copied to the 452 GP register and stored to (pdst) 453 Index 1 double word element from 'in0' vector is copied to the 454 GP register and stored to (pdst + stride) 455 Index 0 double word element from 'in1' vector is copied to the 456 GP register and stored to (pdst + 2 * stride) 457 Index 1 double word element from 'in1' vector is copied to the 458 GP register and stored to (pdst + 3 * stride) 459 */ 460 #define ST8x4_UB(in0, in1, pdst, stride) \ 461 { \ 462 uint64_t out0_m, out1_m, out2_m, out3_m; \ 463 uint8_t *pblk_8x4_m = (uint8_t *)(pdst); \ 464 \ 465 out0_m = __msa_copy_u_d((v2i64)in0, 0); \ 466 out1_m = __msa_copy_u_d((v2i64)in0, 1); \ 467 out2_m = __msa_copy_u_d((v2i64)in1, 0); \ 468 out3_m = __msa_copy_u_d((v2i64)in1, 1); \ 469 \ 470 SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \ 471 } 472 473 /* Description : average with rounding (in0 + in1 + 1) / 2. 474 Arguments : Inputs - in0, in1, in2, in3, 475 Outputs - out0, out1 476 Return Type - as per RTYPE 477 Details : Each unsigned byte element from 'in0' vector is added with 478 each unsigned byte element from 'in1' vector. Then the average 479 with rounding is calculated and written to 'out0' 480 */ 481 #define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \ 482 { \ 483 out0 = (RTYPE)__msa_aver_u_b((v16u8)in0, (v16u8)in1); \ 484 out1 = (RTYPE)__msa_aver_u_b((v16u8)in2, (v16u8)in3); \ 485 } 486 #define AVER_UB2_UB(...) 
AVER_UB2(v16u8, __VA_ARGS__) 487 488 #define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ 489 out2, out3) \ 490 { \ 491 AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \ 492 AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3) \ 493 } 494 #define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__) 495 496 /* Description : Immediate number of elements to slide with zero 497 Arguments : Inputs - in0, in1, slide_val 498 Outputs - out0, out1 499 Return Type - as per RTYPE 500 Details : Byte elements from 'zero_m' vector are slid into 'in0' by 501 value specified in the 'slide_val' 502 */ 503 #define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) \ 504 { \ 505 v16i8 zero_m = { 0 }; \ 506 out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in0, slide_val); \ 507 out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in1, slide_val); \ 508 } 509 #define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__) 510 511 #define SLDI_B4_0(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, \ 512 slide_val) \ 513 { \ 514 SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val); \ 515 SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val); \ 516 } 517 #define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__) 518 519 /* Description : Immediate number of elements to slide 520 Arguments : Inputs - in0_0, in0_1, in1_0, in1_1, slide_val 521 Outputs - out0, out1 522 Return Type - as per RTYPE 523 Details : Byte elements from 'in0_0' vector are slid into 'in1_0' by 524 value specified in the 'slide_val' 525 */ 526 #define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \ 527 { \ 528 out0 = (RTYPE)__msa_sldi_b((v16i8)in0_0, (v16i8)in1_0, slide_val); \ 529 out1 = (RTYPE)__msa_sldi_b((v16i8)in0_1, (v16i8)in1_1, slide_val); \ 530 } 531 #define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__) 532 #define SLDI_B2_SH(...) 
SLDI_B2(v8i16, __VA_ARGS__) 533 534 #define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, out0, out1, \ 535 out2, slide_val) \ 536 { \ 537 SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \ 538 out2 = (RTYPE)__msa_sldi_b((v16i8)in0_2, (v16i8)in1_2, slide_val); \ 539 } 540 #define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__) 541 #define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__) 542 543 /* Description : Shuffle byte vector elements as per mask vector 544 Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 545 Outputs - out0, out1 546 Return Type - as per RTYPE 547 Details : Byte elements from 'in0' & 'in1' are copied selectively to 548 'out0' as per control vector 'mask0' 549 */ 550 #define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \ 551 { \ 552 out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \ 553 out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \ 554 } 555 #define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__) 556 #define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__) 557 #define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__) 558 #define VSHF_B2_SH(...) VSHF_B2(v8i16, __VA_ARGS__) 559 560 #define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, out0, out1, out2, \ 561 out3) \ 562 { \ 563 VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1); \ 564 VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3); \ 565 } 566 #define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__) 567 #define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__) 568 569 /* Description : Dot product of byte vector elements 570 Arguments : Inputs - mult0, mult1, cnst0, cnst1 571 Outputs - out0, out1 572 Return Type - as per RTYPE 573 Details : Unsigned byte elements from 'mult0' are multiplied with 574 unsigned byte elements from 'cnst0' producing a result 575 twice the size of input i.e. unsigned halfword. 
576 The multiplication result of adjacent odd-even elements 577 are added together and written to the 'out0' vector 578 */ 579 #define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ 580 { \ 581 out0 = (RTYPE)__msa_dotp_u_h((v16u8)mult0, (v16u8)cnst0); \ 582 out1 = (RTYPE)__msa_dotp_u_h((v16u8)mult1, (v16u8)cnst1); \ 583 } 584 #define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__) 585 586 #define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ 587 cnst3, out0, out1, out2, out3) \ 588 { \ 589 DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ 590 DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ 591 } 592 #define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__) 593 594 /* Description : Dot product of byte vector elements 595 Arguments : Inputs - mult0, mult1, cnst0, cnst1 596 Outputs - out0, out1 597 Return Type - as per RTYPE 598 Details : Signed byte elements from 'mult0' are multiplied with 599 signed byte elements from 'cnst0' producing a result 600 twice the size of input i.e. signed halfword. 601 The multiplication result of adjacent odd-even elements 602 are added together and written to the 'out0' vector 603 */ 604 #define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ 605 { \ 606 out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0); \ 607 out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1); \ 608 } 609 #define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__) 610 611 #define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ 612 cnst3, out0, out1, out2, out3) \ 613 { \ 614 DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ 615 DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ 616 } 617 #define DOTP_SB4_SH(...) 
DOTP_SB4(v8i16, __VA_ARGS__) 618 619 /* Description : Dot product of halfword vector elements 620 Arguments : Inputs - mult0, mult1, cnst0, cnst1 621 Outputs - out0, out1 622 Return Type - as per RTYPE 623 Details : Signed halfword elements from 'mult0' are multiplied with 624 signed halfword elements from 'cnst0' producing a result 625 twice the size of input i.e. signed word. 626 The multiplication result of adjacent odd-even elements 627 are added together and written to the 'out0' vector 628 */ 629 #define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ 630 { \ 631 out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0); \ 632 out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1); \ 633 } 634 #define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__) 635 636 #define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ 637 cnst3, out0, out1, out2, out3) \ 638 { \ 639 DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ 640 DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ 641 } 642 #define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__) 643 644 /* Description : Dot product of word vector elements 645 Arguments : Inputs - mult0, mult1, cnst0, cnst1 646 Outputs - out0, out1 647 Return Type - as per RTYPE 648 Details : Signed word elements from 'mult0' are multiplied with 649 signed word elements from 'cnst0' producing a result 650 twice the size of input i.e. signed double word. 651 The multiplication result of adjacent odd-even elements 652 are added together and written to the 'out0' vector 653 */ 654 #define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ 655 { \ 656 out0 = (RTYPE)__msa_dotp_s_d((v4i32)mult0, (v4i32)cnst0); \ 657 out1 = (RTYPE)__msa_dotp_s_d((v4i32)mult1, (v4i32)cnst1); \ 658 } 659 #define DOTP_SW2_SD(...) 
DOTP_SW2(v2i64, __VA_ARGS__) 660 661 /* Description : Dot product & addition of byte vector elements 662 Arguments : Inputs - mult0, mult1, cnst0, cnst1 663 Outputs - out0, out1 664 Return Type - as per RTYPE 665 Details : Signed byte elements from 'mult0' are multiplied with 666 signed byte elements from 'cnst0' producing a result 667 twice the size of input i.e. signed halfword. 668 The multiplication result of adjacent odd-even elements 669 are added to the 'out0' vector 670 */ 671 #define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ 672 { \ 673 out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0, (v16i8)mult0, (v16i8)cnst0); \ 674 out1 = (RTYPE)__msa_dpadd_s_h((v8i16)out1, (v16i8)mult1, (v16i8)cnst1); \ 675 } 676 #define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__) 677 678 #define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ 679 cnst3, out0, out1, out2, out3) \ 680 { \ 681 DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ 682 DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ 683 } 684 #define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__) 685 686 /* Description : Dot product & addition of halfword vector elements 687 Arguments : Inputs - mult0, mult1, cnst0, cnst1 688 Outputs - out0, out1 689 Return Type - as per RTYPE 690 Details : Signed halfword elements from 'mult0' are multiplied with 691 signed halfword elements from 'cnst0' producing a result 692 twice the size of input i.e. signed word. 693 The multiplication result of adjacent odd-even elements 694 are added to the 'out0' vector 695 */ 696 #define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ 697 { \ 698 out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)mult0, (v8i16)cnst0); \ 699 out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)mult1, (v8i16)cnst1); \ 700 } 701 #define DPADD_SH2_SW(...) 
DPADD_SH2(v4i32, __VA_ARGS__) 702 703 /* Description : Dot product & addition of double word vector elements 704 Arguments : Inputs - mult0, mult1 705 Outputs - out0, out1 706 Return Type - as per RTYPE 707 Details : Each signed word element from 'mult0' is multiplied with itself 708 producing an intermediate result twice the size of input 709 i.e. signed double word 710 The multiplication result of adjacent odd-even elements 711 are added to the 'out0' vector 712 */ 713 #define DPADD_SD2(RTYPE, mult0, mult1, out0, out1) \ 714 { \ 715 out0 = (RTYPE)__msa_dpadd_s_d((v2i64)out0, (v4i32)mult0, (v4i32)mult0); \ 716 out1 = (RTYPE)__msa_dpadd_s_d((v2i64)out1, (v4i32)mult1, (v4i32)mult1); \ 717 } 718 #define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__) 719 720 /* Description : Minimum values between unsigned elements of 721 either vector are copied to the output vector 722 Arguments : Inputs - in0, in1, min_vec 723 Outputs - in place operation 724 Return Type - as per RTYPE 725 Details : Minimum of unsigned halfword element values from 'in0' and 726 'min_vec' are written to output vector 'in0' 727 */ 728 #define MIN_UH2(RTYPE, in0, in1, min_vec) \ 729 { \ 730 in0 = (RTYPE)__msa_min_u_h((v8u16)in0, min_vec); \ 731 in1 = (RTYPE)__msa_min_u_h((v8u16)in1, min_vec); \ 732 } 733 #define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__) 734 735 #define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec) \ 736 { \ 737 MIN_UH2(RTYPE, in0, in1, min_vec); \ 738 MIN_UH2(RTYPE, in2, in3, min_vec); \ 739 } 740 #define MIN_UH4_UH(...) 
MIN_UH4(v8u16, __VA_ARGS__) 741 742 /* Description : Clips all signed halfword elements of input vector 743 between 0 & 255 744 Arguments : Input - in 745 Output - out_m 746 Return Type - signed halfword 747 */ 748 #define CLIP_SH_0_255(in) \ 749 ({ \ 750 v8i16 max_m = __msa_ldi_h(255); \ 751 v8i16 out_m; \ 752 \ 753 out_m = __msa_maxi_s_h((v8i16)in, 0); \ 754 out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \ 755 out_m; \ 756 }) 757 #define CLIP_SH2_0_255(in0, in1) \ 758 { \ 759 in0 = CLIP_SH_0_255(in0); \ 760 in1 = CLIP_SH_0_255(in1); \ 761 } 762 #define CLIP_SH4_0_255(in0, in1, in2, in3) \ 763 { \ 764 CLIP_SH2_0_255(in0, in1); \ 765 CLIP_SH2_0_255(in2, in3); \ 766 } 767 768 /* Description : Horizontal addition of 4 signed word elements of input vector 769 Arguments : Input - in (signed word vector) 770 Output - sum_m (i32 sum) 771 Return Type - signed word (GP) 772 Details : 4 signed word elements of 'in' vector are added together and 773 the resulting integer sum is returned 774 */ 775 #define HADD_SW_S32(in) \ 776 ({ \ 777 v2i64 res0_m, res1_m; \ 778 int32_t sum_m; \ 779 \ 780 res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in); \ 781 res1_m = __msa_splati_d(res0_m, 1); \ 782 res0_m = res0_m + res1_m; \ 783 sum_m = __msa_copy_s_w((v4i32)res0_m, 0); \ 784 sum_m; \ 785 }) 786 787 /* Description : Horizontal addition of 4 unsigned word elements 788 Arguments : Input - in (unsigned word vector) 789 Output - sum_m (u32 sum) 790 Return Type - unsigned word (GP) 791 Details : 4 unsigned word elements of 'in' vector are added together and 792 the resulting integer sum is returned 793 */ 794 #define HADD_UW_U32(in) \ 795 ({ \ 796 v2u64 res0_m, res1_m; \ 797 uint32_t sum_m; \ 798 \ 799 res0_m = __msa_hadd_u_d((v4u32)in, (v4u32)in); \ 800 res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1); \ 801 res0_m += res1_m; \ 802 sum_m = __msa_copy_u_w((v4i32)res0_m, 0); \ 803 sum_m; \ 804 }) 805 806 /* Description : Horizontal addition of 8 unsigned halfword elements 807 Arguments : 
Input - in (unsigned halfword vector) 808 Output - sum_m (u32 sum) 809 Return Type - unsigned word 810 Details : 8 unsigned halfword elements of 'in' vector are added 811 together and the resulting integer sum is returned 812 */ 813 #define HADD_UH_U32(in) \ 814 ({ \ 815 v4u32 res_m; \ 816 uint32_t sum_m; \ 817 \ 818 res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \ 819 sum_m = HADD_UW_U32(res_m); \ 820 sum_m; \ 821 }) 822 823 /* Description : Horizontal addition of unsigned byte vector elements 824 Arguments : Inputs - in0, in1 825 Outputs - out0, out1 826 Return Type - as per RTYPE 827 Details : Each unsigned odd byte element from 'in0' is added to 828 even unsigned byte element from 'in0' (pairwise) and the 829 halfword result is written to 'out0' 830 */ 831 #define HADD_UB2(RTYPE, in0, in1, out0, out1) \ 832 { \ 833 out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0); \ 834 out1 = (RTYPE)__msa_hadd_u_h((v16u8)in1, (v16u8)in1); \ 835 } 836 #define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__) 837 838 #define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \ 839 { \ 840 HADD_UB2(RTYPE, in0, in1, out0, out1); \ 841 HADD_UB2(RTYPE, in2, in3, out2, out3); \ 842 } 843 #define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__) 844 845 /* Description : Horizontal subtraction of unsigned byte vector elements 846 Arguments : Inputs - in0, in1 847 Outputs - out0, out1 848 Return Type - as per RTYPE 849 Details : Each unsigned odd byte element from 'in0' is subtracted from 850 even unsigned byte element from 'in0' (pairwise) and the 851 halfword result is written to 'out0' 852 */ 853 #define HSUB_UB2(RTYPE, in0, in1, out0, out1) \ 854 { \ 855 out0 = (RTYPE)__msa_hsub_u_h((v16u8)in0, (v16u8)in0); \ 856 out1 = (RTYPE)__msa_hsub_u_h((v16u8)in1, (v16u8)in1); \ 857 } 858 #define HSUB_UB2_SH(...) 
HSUB_UB2(v8i16, __VA_ARGS__) 859 860 /* Description : SAD (Sum of Absolute Difference) 861 Arguments : Inputs - in0, in1, ref0, ref1 862 Outputs - sad_m (halfword vector) 863 Return Type - unsigned halfword 864 Details : Absolute difference of all the byte elements from 'in0' with 865 'ref0' is calculated and preserved in 'diff0'. Then even-odd 866 pairs are added together to generate 8 halfword results. 867 */ 868 #define SAD_UB2_UH(in0, in1, ref0, ref1) \ 869 ({ \ 870 v16u8 diff0_m, diff1_m; \ 871 v8u16 sad_m = { 0 }; \ 872 \ 873 diff0_m = __msa_asub_u_b((v16u8)in0, (v16u8)ref0); \ 874 diff1_m = __msa_asub_u_b((v16u8)in1, (v16u8)ref1); \ 875 \ 876 sad_m += __msa_hadd_u_h((v16u8)diff0_m, (v16u8)diff0_m); \ 877 sad_m += __msa_hadd_u_h((v16u8)diff1_m, (v16u8)diff1_m); \ 878 \ 879 sad_m; \ 880 }) 881 882 /* Description : Horizontal subtraction of signed halfword vector elements 883 Arguments : Inputs - in0, in1 884 Outputs - out0, out1 885 Return Type - as per RTYPE 886 Details : Each signed odd halfword element from 'in0' is subtracted from 887 even signed halfword element from 'in0' (pairwise) and the 888 word result is written to 'out0' 889 */ 890 #define HSUB_UH2(RTYPE, in0, in1, out0, out1) \ 891 { \ 892 out0 = (RTYPE)__msa_hsub_s_w((v8i16)in0, (v8i16)in0); \ 893 out1 = (RTYPE)__msa_hsub_s_w((v8i16)in1, (v8i16)in1); \ 894 } 895 #define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__) 896 897 /* Description : Set element n input vector to GPR value 898 Arguments : Inputs - in0, in1, in2, in3 899 Output - out 900 Return Type - as per RTYPE 901 Details : Set element 0 in vector 'out' to value specified in 'in0' 902 */ 903 #define INSERT_W2(RTYPE, in0, in1, out) \ 904 { \ 905 out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \ 906 out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \ 907 } 908 #define INSERT_W2_SB(...) 
INSERT_W2(v16i8, __VA_ARGS__) 909 910 #define INSERT_W4(RTYPE, in0, in1, in2, in3, out) \ 911 { \ 912 out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \ 913 out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \ 914 out = (RTYPE)__msa_insert_w((v4i32)out, 2, in2); \ 915 out = (RTYPE)__msa_insert_w((v4i32)out, 3, in3); \ 916 } 917 #define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__) 918 #define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__) 919 920 #define INSERT_D2(RTYPE, in0, in1, out) \ 921 { \ 922 out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0); \ 923 out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1); \ 924 } 925 #define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__) 926 #define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__) 927 #define INSERT_D2_SH(...) INSERT_D2(v8i16, __VA_ARGS__) 928 929 /* Description : Interleave even byte elements from vectors 930 Arguments : Inputs - in0, in1, in2, in3 931 Outputs - out0, out1 932 Return Type - as per RTYPE 933 Details : Even byte elements of 'in0' and 'in1' are interleaved 934 and written to 'out0' 935 */ 936 #define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ 937 { \ 938 out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \ 939 out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \ 940 } 941 #define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__) 942 #define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__) 943 944 /* Description : Interleave even halfword elements from vectors 945 Arguments : Inputs - in0, in1, in2, in3 946 Outputs - out0, out1 947 Return Type - as per RTYPE 948 Details : Even halfword elements of 'in0' and 'in1' are interleaved 949 and written to 'out0' 950 */ 951 #define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ 952 { \ 953 out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \ 954 out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2); \ 955 } 956 #define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__) 957 #define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__) 958 #define ILVEV_H2_SW(...) 
ILVEV_H2(v4i32, __VA_ARGS__) 959 960 /* Description : Interleave even word elements from vectors 961 Arguments : Inputs - in0, in1, in2, in3 962 Outputs - out0, out1 963 Return Type - as per RTYPE 964 Details : Even word elements of 'in0' and 'in1' are interleaved 965 and written to 'out0' 966 */ 967 #define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ 968 { \ 969 out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0); \ 970 out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2); \ 971 } 972 #define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__) 973 974 /* Description : Interleave even double word elements from vectors 975 Arguments : Inputs - in0, in1, in2, in3 976 Outputs - out0, out1 977 Return Type - as per RTYPE 978 Details : Even double word elements of 'in0' and 'in1' are interleaved 979 and written to 'out0' 980 */ 981 #define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ 982 { \ 983 out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0); \ 984 out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2); \ 985 } 986 #define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__) 987 988 /* Description : Interleave left half of byte elements from vectors 989 Arguments : Inputs - in0, in1, in2, in3 990 Outputs - out0, out1 991 Return Type - as per RTYPE 992 Details : Left half of byte elements of 'in0' and 'in1' are interleaved 993 and written to 'out0'. 994 */ 995 #define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ 996 { \ 997 out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ 998 out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3); \ 999 } 1000 #define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__) 1001 #define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__) 1002 #define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__) 1003 #define ILVL_B2_SH(...) 
ILVL_B2(v8i16, __VA_ARGS__) 1004 1005 #define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ 1006 out2, out3) \ 1007 { \ 1008 ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ 1009 ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ 1010 } 1011 #define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__) 1012 #define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__) 1013 #define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__) 1014 1015 /* Description : Interleave left half of halfword elements from vectors 1016 Arguments : Inputs - in0, in1, in2, in3 1017 Outputs - out0, out1 1018 Return Type - as per RTYPE 1019 Details : Left half of halfword elements of 'in0' and 'in1' are 1020 interleaved and written to 'out0'. 1021 */ 1022 #define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1023 { \ 1024 out0 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \ 1025 out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3); \ 1026 } 1027 #define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__) 1028 #define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__) 1029 1030 /* Description : Interleave left half of word elements from vectors 1031 Arguments : Inputs - in0, in1, in2, in3 1032 Outputs - out0, out1 1033 Return Type - as per RTYPE 1034 Details : Left half of word elements of 'in0' and 'in1' are interleaved 1035 and written to 'out0'. 1036 */ 1037 #define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1038 { \ 1039 out0 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ 1040 out1 = (RTYPE)__msa_ilvl_w((v4i32)in2, (v4i32)in3); \ 1041 } 1042 #define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__) 1043 #define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__) 1044 1045 /* Description : Interleave right half of byte elements from vectors 1046 Arguments : Inputs - in0, in1, in2, in3 1047 Outputs - out0, out1 1048 Return Type - as per RTYPE 1049 Details : Right half of byte elements of 'in0' and 'in1' are interleaved 1050 and written to out0. 
1051 */ 1052 #define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1053 { \ 1054 out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ 1055 out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3); \ 1056 } 1057 #define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__) 1058 #define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__) 1059 #define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__) 1060 #define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__) 1061 1062 #define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ 1063 out2, out3) \ 1064 { \ 1065 ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ 1066 ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ 1067 } 1068 #define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__) 1069 #define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__) 1070 #define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__) 1071 #define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__) 1072 1073 #define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, \ 1074 in11, in12, in13, in14, in15, out0, out1, out2, out3, out4, \ 1075 out5, out6, out7) \ 1076 { \ 1077 ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ 1078 out3); \ 1079 ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15, out4, out5, \ 1080 out6, out7); \ 1081 } 1082 #define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__) 1083 1084 /* Description : Interleave right half of halfword elements from vectors 1085 Arguments : Inputs - in0, in1, in2, in3 1086 Outputs - out0, out1 1087 Return Type - as per RTYPE 1088 Details : Right half of halfword elements of 'in0' and 'in1' are 1089 interleaved and written to 'out0'. 1090 */ 1091 #define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1092 { \ 1093 out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ 1094 out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \ 1095 } 1096 #define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__) 1097 #define ILVR_H2_SW(...) 
ILVR_H2(v4i32, __VA_ARGS__) 1098 1099 #define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ 1100 out2, out3) \ 1101 { \ 1102 ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ 1103 ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ 1104 } 1105 #define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__) 1106 1107 #define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1108 { \ 1109 out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \ 1110 out1 = (RTYPE)__msa_ilvr_w((v4i32)in2, (v4i32)in3); \ 1111 } 1112 #define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__) 1113 #define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__) 1114 1115 #define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ 1116 out2, out3) \ 1117 { \ 1118 ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1); \ 1119 ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3); \ 1120 } 1121 #define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__) 1122 1123 /* Description : Interleave right half of double word elements from vectors 1124 Arguments : Inputs - in0, in1, in2, in3 1125 Outputs - out0, out1 1126 Return Type - as per RTYPE 1127 Details : Right half of double word elements of 'in0' and 'in1' are 1128 interleaved and written to 'out0'. 1129 */ 1130 #define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1131 { \ 1132 out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \ 1133 out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \ 1134 } 1135 #define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__) 1136 #define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__) 1137 #define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__) 1138 1139 #define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \ 1140 { \ 1141 ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ 1142 out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5)); \ 1143 } 1144 #define ILVR_D3_SB(...) 
ILVR_D3(v16i8, __VA_ARGS__) 1145 1146 #define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ 1147 out2, out3) \ 1148 { \ 1149 ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ 1150 ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3); \ 1151 } 1152 #define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__) 1153 #define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__) 1154 1155 /* Description : Interleave both left and right half of input vectors 1156 Arguments : Inputs - in0, in1 1157 Outputs - out0, out1 1158 Return Type - as per RTYPE 1159 Details : Right half of byte elements from 'in0' and 'in1' are 1160 interleaved and written to 'out0' 1161 */ 1162 #define ILVRL_B2(RTYPE, in0, in1, out0, out1) \ 1163 { \ 1164 out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ 1165 out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ 1166 } 1167 #define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__) 1168 #define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__) 1169 #define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__) 1170 #define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__) 1171 1172 #define ILVRL_H2(RTYPE, in0, in1, out0, out1) \ 1173 { \ 1174 out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ 1175 out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \ 1176 } 1177 #define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__) 1178 #define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__) 1179 1180 #define ILVRL_W2(RTYPE, in0, in1, out0, out1) \ 1181 { \ 1182 out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \ 1183 out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ 1184 } 1185 #define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__) 1186 #define ILVRL_W2_SB(...) ILVRL_W2(v16i8, __VA_ARGS__) 1187 #define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__) 1188 #define ILVRL_W2_SW(...) 
ILVRL_W2(v4i32, __VA_ARGS__) 1189 1190 /* Description : Saturate the halfword element values to the max 1191 unsigned value of (sat_val + 1) bits 1192 The element data width remains unchanged 1193 Arguments : Inputs - in0, in1, sat_val 1194 Outputs - in place operation 1195 Return Type - as per RTYPE 1196 Details : Each unsigned halfword element from 'in0' is saturated to the 1197 value generated with (sat_val + 1) bit range. 1198 The results are written in place 1199 */ 1200 #define SAT_UH2(RTYPE, in0, in1, sat_val) \ 1201 { \ 1202 in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val); \ 1203 in1 = (RTYPE)__msa_sat_u_h((v8u16)in1, sat_val); \ 1204 } 1205 #define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__) 1206 1207 #define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) \ 1208 { \ 1209 SAT_UH2(RTYPE, in0, in1, sat_val); \ 1210 SAT_UH2(RTYPE, in2, in3, sat_val) \ 1211 } 1212 #define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__) 1213 1214 /* Description : Saturate the halfword element values to the max 1215 unsigned value of (sat_val + 1) bits 1216 The element data width remains unchanged 1217 Arguments : Inputs - in0, in1, sat_val 1218 Outputs - in place operation 1219 Return Type - as per RTYPE 1220 Details : Each unsigned halfword element from 'in0' is saturated to the 1221 value generated with (sat_val + 1) bit range 1222 The results are written in place 1223 */ 1224 #define SAT_SH2(RTYPE, in0, in1, sat_val) \ 1225 { \ 1226 in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val); \ 1227 in1 = (RTYPE)__msa_sat_s_h((v8i16)in1, sat_val); \ 1228 } 1229 #define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__) 1230 1231 #define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) \ 1232 { \ 1233 SAT_SH2(RTYPE, in0, in1, sat_val); \ 1234 SAT_SH2(RTYPE, in2, in3, sat_val); \ 1235 } 1236 #define SAT_SH4_SH(...) 
SAT_SH4(v8i16, __VA_ARGS__) 1237 1238 /* Description : Indexed halfword element values are replicated to all 1239 elements in output vector 1240 Arguments : Inputs - in, idx0, idx1 1241 Outputs - out0, out1 1242 Return Type - as per RTYPE 1243 Details : 'idx0' element value from 'in' vector is replicated to all 1244 elements in 'out0' vector 1245 Valid index range for halfword operation is 0-7 1246 */ 1247 #define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) \ 1248 { \ 1249 out0 = (RTYPE)__msa_splati_h((v8i16)in, idx0); \ 1250 out1 = (RTYPE)__msa_splati_h((v8i16)in, idx1); \ 1251 } 1252 #define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__) 1253 1254 #define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, out0, out1, out2, out3) \ 1255 { \ 1256 SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \ 1257 SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3); \ 1258 } 1259 #define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__) 1260 #define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__) 1261 1262 /* Description : Pack even byte elements of vector pairs 1263 Arguments : Inputs - in0, in1, in2, in3 1264 Outputs - out0, out1 1265 Return Type - as per RTYPE 1266 Details : Even byte elements of 'in0' are copied to the left half of 1267 'out0' & even byte elements of 'in1' are copied to the right 1268 half of 'out0'. 1269 */ 1270 #define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1271 { \ 1272 out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1); \ 1273 out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3); \ 1274 } 1275 #define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__) 1276 #define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__) 1277 #define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__) 1278 1279 #define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ 1280 out2, out3) \ 1281 { \ 1282 PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ 1283 PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ 1284 } 1285 #define PCKEV_B4_SB(...) 
PCKEV_B4(v16i8, __VA_ARGS__) 1286 #define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__) 1287 #define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__) 1288 1289 /* Description : Pack even halfword elements of vector pairs 1290 Arguments : Inputs - in0, in1, in2, in3 1291 Outputs - out0, out1 1292 Return Type - as per RTYPE 1293 Details : Even halfword elements of 'in0' are copied to the left half of 1294 'out0' & even halfword elements of 'in1' are copied to the 1295 right half of 'out0'. 1296 */ 1297 #define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1298 { \ 1299 out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1); \ 1300 out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3); \ 1301 } 1302 #define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__) 1303 #define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__) 1304 1305 #define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ 1306 out2, out3) \ 1307 { \ 1308 PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ 1309 PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ 1310 } 1311 #define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__) 1312 1313 /* Description : Pack even double word elements of vector pairs 1314 Arguments : Inputs - in0, in1, in2, in3 1315 Outputs - out0, out1 1316 Return Type - as per RTYPE 1317 Details : Even double elements of 'in0' are copied to the left half of 1318 'out0' & even double elements of 'in1' are copied to the right 1319 half of 'out0'. 1320 */ 1321 #define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1322 { \ 1323 out0 = (RTYPE)__msa_pckev_d((v2i64)in0, (v2i64)in1); \ 1324 out1 = (RTYPE)__msa_pckev_d((v2i64)in2, (v2i64)in3); \ 1325 } 1326 #define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__) 1327 #define PCKEV_D2_SH(...) 
PCKEV_D2(v8i16, __VA_ARGS__) 1328 1329 #define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ 1330 out2, out3) \ 1331 { \ 1332 PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ 1333 PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3); \ 1334 } 1335 #define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__) 1336 1337 /* Description : Each byte element is logically xor'ed with immediate 128 1338 Arguments : Inputs - in0, in1 1339 Outputs - in place operation 1340 Return Type - as per RTYPE 1341 Details : Each unsigned byte element from input vector 'in0' is 1342 logically xor'ed with 128 and the result is stored in-place. 1343 */ 1344 #define XORI_B2_128(RTYPE, in0, in1) \ 1345 { \ 1346 in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128); \ 1347 in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128); \ 1348 } 1349 #define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__) 1350 #define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__) 1351 1352 #define XORI_B3_128(RTYPE, in0, in1, in2) \ 1353 { \ 1354 XORI_B2_128(RTYPE, in0, in1); \ 1355 in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128); \ 1356 } 1357 #define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__) 1358 1359 #define XORI_B4_128(RTYPE, in0, in1, in2, in3) \ 1360 { \ 1361 XORI_B2_128(RTYPE, in0, in1); \ 1362 XORI_B2_128(RTYPE, in2, in3); \ 1363 } 1364 #define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__) 1365 #define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__) 1366 1367 #define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) \ 1368 { \ 1369 XORI_B4_128(RTYPE, in0, in1, in2, in3); \ 1370 XORI_B3_128(RTYPE, in4, in5, in6); \ 1371 } 1372 #define XORI_B7_128_SB(...) 
XORI_B7_128(v16i8, __VA_ARGS__) 1373 1374 /* Description : Average of signed halfword elements -> (a + b) / 2 1375 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 1376 Outputs - out0, out1, out2, out3 1377 Return Type - as per RTYPE 1378 Details : Each signed halfword element from 'in0' is added to each 1379 signed halfword element of 'in1' with full precision resulting 1380 in one extra bit in the result. The result is then divided by 1381 2 and written to 'out0' 1382 */ 1383 #define AVE_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ 1384 out2, out3) \ 1385 { \ 1386 out0 = (RTYPE)__msa_ave_s_h((v8i16)in0, (v8i16)in1); \ 1387 out1 = (RTYPE)__msa_ave_s_h((v8i16)in2, (v8i16)in3); \ 1388 out2 = (RTYPE)__msa_ave_s_h((v8i16)in4, (v8i16)in5); \ 1389 out3 = (RTYPE)__msa_ave_s_h((v8i16)in6, (v8i16)in7); \ 1390 } 1391 #define AVE_SH4_SH(...) AVE_SH4(v8i16, __VA_ARGS__) 1392 1393 /* Description : Addition of signed halfword elements and signed saturation 1394 Arguments : Inputs - in0, in1, in2, in3 1395 Outputs - out0, out1 1396 Return Type - as per RTYPE 1397 Details : Signed halfword elements from 'in0' are added to signed 1398 halfword elements of 'in1'. The result is then signed saturated 1399 between halfword data type range 1400 */ 1401 #define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1402 { \ 1403 out0 = (RTYPE)__msa_adds_s_h((v8i16)in0, (v8i16)in1); \ 1404 out1 = (RTYPE)__msa_adds_s_h((v8i16)in2, (v8i16)in3); \ 1405 } 1406 #define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__) 1407 1408 #define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ 1409 out2, out3) \ 1410 { \ 1411 ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1); \ 1412 ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3); \ 1413 } 1414 #define ADDS_SH4_SH(...) 
ADDS_SH4(v8i16, __VA_ARGS__) 1415 1416 /* Description : Shift left all elements of vector (generic for all data types) 1417 Arguments : Inputs - in0, in1, in2, in3, shift 1418 Outputs - in place operation 1419 Return Type - as per input vector RTYPE 1420 Details : Each element of vector 'in0' is left shifted by 'shift' and 1421 the result is written in-place. 1422 */ 1423 #define SLLI_4V(in0, in1, in2, in3, shift) \ 1424 { \ 1425 in0 = in0 << shift; \ 1426 in1 = in1 << shift; \ 1427 in2 = in2 << shift; \ 1428 in3 = in3 << shift; \ 1429 } 1430 1431 /* Description : Arithmetic shift right all elements of vector 1432 (generic for all data types) 1433 Arguments : Inputs - in0, in1, in2, in3, shift 1434 Outputs - in place operation 1435 Return Type - as per input vector RTYPE 1436 Details : Each element of vector 'in0' is right shifted by 'shift' and 1437 the result is written in-place. 'shift' is a GP variable. 1438 */ 1439 #define SRA_2V(in0, in1, shift) \ 1440 { \ 1441 in0 = in0 >> shift; \ 1442 in1 = in1 >> shift; \ 1443 } 1444 1445 #define SRA_4V(in0, in1, in2, in3, shift) \ 1446 { \ 1447 in0 = in0 >> shift; \ 1448 in1 = in1 >> shift; \ 1449 in2 = in2 >> shift; \ 1450 in3 = in3 >> shift; \ 1451 } 1452 1453 /* Description : Shift right arithmetic rounded words 1454 Arguments : Inputs - in0, in1, shift 1455 Outputs - in place operation 1456 Return Type - as per RTYPE 1457 Details : Each element of vector 'in0' is shifted right arithmetically by 1458 the number of bits in the corresponding element in the vector 1459 'shift'. The last discarded bit is added to shifted value for 1460 rounding and the result is written in-place. 1461 'shift' is a vector. 
1462 */ 1463 #define SRAR_W2(RTYPE, in0, in1, shift) \ 1464 { \ 1465 in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)shift); \ 1466 in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)shift); \ 1467 } 1468 1469 #define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) \ 1470 { \ 1471 SRAR_W2(RTYPE, in0, in1, shift) \ 1472 SRAR_W2(RTYPE, in2, in3, shift) \ 1473 } 1474 #define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__) 1475 1476 /* Description : Shift right arithmetic rounded (immediate) 1477 Arguments : Inputs - in0, in1, shift 1478 Outputs - in place operation 1479 Return Type - as per RTYPE 1480 Details : Each element of vector 'in0' is shifted right arithmetically by 1481 the value in 'shift'. The last discarded bit is added to the 1482 shifted value for rounding and the result is written in-place. 1483 'shift' is an immediate value. 1484 */ 1485 #define SRARI_H2(RTYPE, in0, in1, shift) \ 1486 { \ 1487 in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift); \ 1488 in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift); \ 1489 } 1490 #define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__) 1491 #define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__) 1492 1493 #define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) \ 1494 { \ 1495 SRARI_H2(RTYPE, in0, in1, shift); \ 1496 SRARI_H2(RTYPE, in2, in3, shift); \ 1497 } 1498 #define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__) 1499 #define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__) 1500 1501 #define SRARI_W2(RTYPE, in0, in1, shift) \ 1502 { \ 1503 in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift); \ 1504 in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift); \ 1505 } 1506 #define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__) 1507 1508 #define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) \ 1509 { \ 1510 SRARI_W2(RTYPE, in0, in1, shift); \ 1511 SRARI_W2(RTYPE, in2, in3, shift); \ 1512 } 1513 #define SRARI_W4_SW(...) 
SRARI_W4(v4i32, __VA_ARGS__) 1514 1515 /* Description : Logical shift right all elements of vector (immediate) 1516 Arguments : Inputs - in0, in1, in2, in3, shift 1517 Outputs - out0, out1, out2, out3 1518 Return Type - as per RTYPE 1519 Details : Each element of vector 'in0' is right shifted by 'shift' and 1520 the result is written in-place. 'shift' is an immediate value. 1521 */ 1522 #define SRLI_H4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, shift) \ 1523 { \ 1524 out0 = (RTYPE)__msa_srli_h((v8i16)in0, shift); \ 1525 out1 = (RTYPE)__msa_srli_h((v8i16)in1, shift); \ 1526 out2 = (RTYPE)__msa_srli_h((v8i16)in2, shift); \ 1527 out3 = (RTYPE)__msa_srli_h((v8i16)in3, shift); \ 1528 } 1529 #define SRLI_H4_SH(...) SRLI_H4(v8i16, __VA_ARGS__) 1530 1531 /* Description : Multiplication of pairs of vectors 1532 Arguments : Inputs - in0, in1, in2, in3 1533 Outputs - out0, out1 1534 Details : Each element from 'in0' is multiplied with elements from 'in1' 1535 and the result is written to 'out0' 1536 */ 1537 #define MUL2(in0, in1, in2, in3, out0, out1) \ 1538 { \ 1539 out0 = in0 * in1; \ 1540 out1 = in2 * in3; \ 1541 } 1542 #define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \ 1543 { \ 1544 MUL2(in0, in1, in2, in3, out0, out1); \ 1545 MUL2(in4, in5, in6, in7, out2, out3); \ 1546 } 1547 1548 /* Description : Addition of 2 pairs of vectors 1549 Arguments : Inputs - in0, in1, in2, in3 1550 Outputs - out0, out1 1551 Details : Each element in 'in0' is added to 'in1' and result is written 1552 to 'out0'. 
1553 */ 1554 #define ADD2(in0, in1, in2, in3, out0, out1) \ 1555 { \ 1556 out0 = in0 + in1; \ 1557 out1 = in2 + in3; \ 1558 } 1559 #define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \ 1560 { \ 1561 ADD2(in0, in1, in2, in3, out0, out1); \ 1562 ADD2(in4, in5, in6, in7, out2, out3); \ 1563 } 1564 1565 /* Description : Subtraction of 2 pairs of vectors 1566 Arguments : Inputs - in0, in1, in2, in3 1567 Outputs - out0, out1 1568 Details : Each element in 'in1' is subtracted from 'in0' and result is 1569 written to 'out0'. 1570 */ 1571 #define SUB2(in0, in1, in2, in3, out0, out1) \ 1572 { \ 1573 out0 = in0 - in1; \ 1574 out1 = in2 - in3; \ 1575 } 1576 #define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \ 1577 { \ 1578 out0 = in0 - in1; \ 1579 out1 = in2 - in3; \ 1580 out2 = in4 - in5; \ 1581 out3 = in6 - in7; \ 1582 } 1583 1584 /* Description : Sign extend halfword elements from right half of the vector 1585 Arguments : Input - in (halfword vector) 1586 Output - out (sign extended word vector) 1587 Return Type - signed word 1588 Details : Sign bit of halfword elements from input vector 'in' is 1589 extracted and interleaved with same vector 'in0' to generate 1590 4 word elements keeping sign intact 1591 */ 1592 #define UNPCK_R_SH_SW(in, out) \ 1593 { \ 1594 v8i16 sign_m; \ 1595 \ 1596 sign_m = __msa_clti_s_h((v8i16)in, 0); \ 1597 out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in); \ 1598 } 1599 1600 /* Description : Sign extend byte elements from input vector and return 1601 halfword results in pair of vectors 1602 Arguments : Input - in (byte vector) 1603 Outputs - out0, out1 (sign extended halfword vectors) 1604 Return Type - signed halfword 1605 Details : Sign bit of byte elements from input vector 'in' is 1606 extracted and interleaved right with same vector 'in0' to 1607 generate 8 signed halfword elements in 'out0' 1608 Then interleaved left with same vector 'in0' to 1609 generate 8 signed halfword elements in 'out1' 1610 
*/ 1611 #define UNPCK_SB_SH(in, out0, out1) \ 1612 { \ 1613 v16i8 tmp_m; \ 1614 \ 1615 tmp_m = __msa_clti_s_b((v16i8)in, 0); \ 1616 ILVRL_B2_SH(tmp_m, in, out0, out1); \ 1617 } 1618 1619 /* Description : Zero extend unsigned byte elements to halfword elements 1620 Arguments : Input - in (unsigned byte vector) 1621 Outputs - out0, out1 (unsigned halfword vectors) 1622 Return Type - signed halfword 1623 Details : Zero extended right half of vector is returned in 'out0' 1624 Zero extended left half of vector is returned in 'out1' 1625 */ 1626 #define UNPCK_UB_SH(in, out0, out1) \ 1627 { \ 1628 v16i8 zero_m = { 0 }; \ 1629 \ 1630 ILVRL_B2_SH(zero_m, in, out0, out1); \ 1631 } 1632 1633 /* Description : Sign extend halfword elements from input vector and return 1634 the result in pair of vectors 1635 Arguments : Input - in (halfword vector) 1636 Outputs - out0, out1 (sign extended word vectors) 1637 Return Type - signed word 1638 Details : Sign bit of halfword elements from input vector 'in' is 1639 extracted and interleaved right with same vector 'in0' to 1640 generate 4 signed word elements in 'out0' 1641 Then interleaved left with same vector 'in0' to 1642 generate 4 signed word elements in 'out1' 1643 */ 1644 #define UNPCK_SH_SW(in, out0, out1) \ 1645 { \ 1646 v8i16 tmp_m; \ 1647 \ 1648 tmp_m = __msa_clti_s_h((v8i16)in, 0); \ 1649 ILVRL_H2_SW(tmp_m, in, out0, out1); \ 1650 } 1651 1652 /* Description : Butterfly of 4 input vectors 1653 Arguments : Inputs - in0, in1, in2, in3 1654 Outputs - out0, out1, out2, out3 1655 Details : Butterfly operation 1656 */ 1657 #define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) \ 1658 { \ 1659 out0 = in0 + in3; \ 1660 out1 = in1 + in2; \ 1661 \ 1662 out2 = in1 - in2; \ 1663 out3 = in0 - in3; \ 1664 } 1665 1666 /* Description : Butterfly of 8 input vectors 1667 Arguments : Inputs - in0 ... in7 1668 Outputs - out0 .. 
out7 1669 Details : Butterfly operation 1670 */ 1671 #define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ 1672 out3, out4, out5, out6, out7) \ 1673 { \ 1674 out0 = in0 + in7; \ 1675 out1 = in1 + in6; \ 1676 out2 = in2 + in5; \ 1677 out3 = in3 + in4; \ 1678 \ 1679 out4 = in3 - in4; \ 1680 out5 = in2 - in5; \ 1681 out6 = in1 - in6; \ 1682 out7 = in0 - in7; \ 1683 } 1684 1685 /* Description : Butterfly of 16 input vectors 1686 Arguments : Inputs - in0 ... in15 1687 Outputs - out0 .. out15 1688 Details : Butterfly operation 1689 */ 1690 #define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, \ 1691 in11, in12, in13, in14, in15, out0, out1, out2, out3, \ 1692 out4, out5, out6, out7, out8, out9, out10, out11, out12, \ 1693 out13, out14, out15) \ 1694 { \ 1695 out0 = in0 + in15; \ 1696 out1 = in1 + in14; \ 1697 out2 = in2 + in13; \ 1698 out3 = in3 + in12; \ 1699 out4 = in4 + in11; \ 1700 out5 = in5 + in10; \ 1701 out6 = in6 + in9; \ 1702 out7 = in7 + in8; \ 1703 \ 1704 out8 = in7 - in8; \ 1705 out9 = in6 - in9; \ 1706 out10 = in5 - in10; \ 1707 out11 = in4 - in11; \ 1708 out12 = in3 - in12; \ 1709 out13 = in2 - in13; \ 1710 out14 = in1 - in14; \ 1711 out15 = in0 - in15; \ 1712 } 1713 1714 /* Description : Transpose input 8x8 byte block 1715 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 1716 Outputs - out0, out1, out2, out3, out4, out5, out6, out7 1717 Return Type - as per RTYPE 1718 */ 1719 #define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, \ 1720 out1, out2, out3, out4, out5, out6, out7) \ 1721 { \ 1722 v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ 1723 v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ 1724 \ 1725 ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, tmp0_m, tmp1_m, tmp2_m, \ 1726 tmp3_m); \ 1727 ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m); \ 1728 ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m); \ 1729 ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2); \ 1730 ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, 
out6);                                                                     \
    SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8);                              \
    SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8);                              \
  }
#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)

/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                           in8, in9, in10, in11, in12, in13, in14, in15
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - unsigned byte
   Note        : The out0..out7 registers are deliberately used as scratch
                 space during the interleave sequence (e.g. out5/out7 are
                 overwritten before their final values are produced), so the
                 statement order below must not be changed.
*/
#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, \
                            in10, in11, in12, in13, in14, in15, out0, out1,   \
                            out2, out3, out4, out5, out6, out7)               \
  {                                                                           \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                     \
    v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                     \
                                                                              \
    ILVEV_D2_UB(in0, in8, in1, in9, out7, out6);                              \
    ILVEV_D2_UB(in2, in10, in3, in11, out5, out4);                            \
    ILVEV_D2_UB(in4, in12, in5, in13, out3, out2);                            \
    ILVEV_D2_UB(in6, in14, in7, in15, out1, out0);                            \
                                                                              \
    tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7);                  \
    tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7);                  \
    tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5);                  \
    tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5);                  \
    out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3);                    \
    tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3);                  \
    out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1);                    \
    tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1);                  \
                                                                              \
    ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m);                  \
    out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
    out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
                                                                              \
    tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m);              \
    tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5);                  \
    out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
    out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
                                                                              \
    ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m);              \
    out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
    out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
                                                                              \
    tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m);              \
    tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m);              \
    out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
    out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
  }

/* Description : Transpose 4x4 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed halfword
   Note        : Only the low 4 halfwords of each input row are transposed;
                 out1/out3 are built from the upper doublewords of the
                 interleave results.
*/
#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
  {                                                                    \
    v8i16 s0_m, s1_m;                                                  \
                                                                       \
    ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m);                        \
    ILVRL_W2_SH(s1_m, s0_m, out0, out2);                               \
    out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0);              \
    out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2);              \
  }

/* Description : Transpose 4x8 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - signed halfword
   Note        : Only 4 meaningful output rows exist for a 4x8 transpose;
                 out4..out7 are cleared to zero.
*/
#define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                           out2, out3, out4, out5, out6, out7)                 \
  {                                                                            \
    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                      \
    v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n;                                      \
    v8i16 zero_m = { 0 };                                                      \
                                                                               \
    ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, tmp0_n, tmp1_n, tmp2_n, \
               tmp3_n);                                                        \
    ILVRL_W2_SH(tmp1_n, tmp0_n, tmp0_m, tmp2_m);                               \
    ILVRL_W2_SH(tmp3_n, tmp2_n, tmp1_m, tmp3_m);                               \
                                                                               \
    out0 = (v8i16)__msa_ilvr_d((v2i64)tmp1_m, (v2i64)tmp0_m);                  \
    out1 = (v8i16)__msa_ilvl_d((v2i64)tmp1_m, (v2i64)tmp0_m);                  \
    out2 = (v8i16)__msa_ilvr_d((v2i64)tmp3_m, (v2i64)tmp2_m);                  \
    out3 = (v8i16)__msa_ilvl_d((v2i64)tmp3_m, (v2i64)tmp2_m);                  \
                                                                               \
    out4 = zero_m;                                                             \
    out5 = zero_m;                                                             \
    out6 = zero_m;                                                             \
    out7 = zero_m;                                                             \
  }

/* Description : Transpose 8x4 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed halfword
*/
#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
  {                                                                    \
    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                              \
                                                                       \
    ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m);                    \
    ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m);                    \
    ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2);            \
    ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3);            \
  }

/* Description : Transpose 8x8 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - as per RTYPE
*/
#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0,   \
                       out1, out2, out3, out4, out5, out6, out7)              \
  {                                                                           \
    v8i16 s0_m, s1_m;                                                         \
    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                     \
    v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                     \
                                                                              \
    ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                               \
    ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m);                                  \
    ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                               \
    ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m);                                  \
    ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                               \
    ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m);                                  \
    ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                               \
    ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m);                                  \
    PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, tmp3_m,   \
             tmp7_m, out0, out2, out4, out6);                                 \
    out1 = (RTYPE)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m);                \
    out3 = (RTYPE)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m);                \
    out5 = (RTYPE)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m);                \
    out7 = (RTYPE)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m);                \
  }
#define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)

/* Description : Transpose 4x4 block with word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed word
*/
#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) \
  {                                                                    \
    v4i32 s0_m, s1_m, s2_m, s3_m;                                      \
                                                                       \
    ILVRL_W2_SW(in1, in0, s0_m, s1_m);                                 \
    ILVRL_W2_SW(in3, in2, s2_m, s3_m);                                 \
                                                                       \
    out0 = (v4i32)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m);              \
    out1 = (v4i32)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m);              \
    out2 = (v4i32)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m);              \
    out3 = (v4i32)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m);              \
  }

/* Description : Add block 4x4
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Least significant 4 bytes from each input vector are added to
                 the destination bytes, clipped between 0-255 and stored.
   NOTE(review): there is intentionally no ';' after the ILVR_D2_SH()
                 invocation below — presumably that macro expands to a braced
                 statement block; confirm against its definition before
                 "fixing" it.
*/
#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride)       \
  {                                                             \
    uint32_t src0_m, src1_m, src2_m, src3_m;                    \
    v8i16 inp0_m, inp1_m, res0_m, res1_m;                       \
    v16i8 dst0_m = { 0 };                                       \
    v16i8 dst1_m = { 0 };                                       \
    v16i8 zero_m = { 0 };                                       \
                                                                \
    ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m)              \
    LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m);          \
    INSERT_W2_SB(src0_m, src1_m, dst0_m);                       \
    INSERT_W2_SB(src2_m, src3_m, dst1_m);                       \
    ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m); \
    ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m);       \
    CLIP_SH2_0_255(res0_m, res1_m);                             \
    PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \
    ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride);         \
  }

/* Description : Pack even elements of input vectors & xor with 128
   Arguments   : Inputs  - in0, in1
                 Output  - out_m
                 Return Type - unsigned byte
   Details     : Signed byte even elements from 'in0' and 'in1' are packed
                 together in one vector and the resulting vector is xor'ed
                 with 128 to shift the range from signed to unsigned byte
*/
#define PCKEV_XORI128_UB(in0, in1)                        \
  ({                                                      \
    v16u8 out_m;                                          \
                                                          \
    out_m = (v16u8)__msa_pckev_b((v16i8)in1, (v16i8)in0); \
    out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128);       \
    out_m;                                                \
  })

/* Description : Converts inputs to unsigned bytes, interleave, average & store
                 as 8x4 unsigned byte block
   Arguments   : Inputs - in0, in1, in2, in3, dst0, dst1, pdst, stride
*/
#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, pdst, stride) \
  {                                                                           \
    v16u8 tmp0_m, tmp1_m;                                                     \
    uint8_t *pdst_m = (uint8_t *)(pdst);                                      \
                                                                              \
    tmp0_m = PCKEV_XORI128_UB(in0, in1);                                      \
    tmp1_m = PCKEV_XORI128_UB(in2, in3);                                      \
    AVER_UB2_UB(tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m);                  \
    ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride);                                 \
  }

/* Description : Pack even byte elements and store byte vector in destination
                 memory
   Arguments   : Inputs - in0, in1, pdst
*/
#define PCKEV_ST_SB(in0, in1, pdst)                      \
  {                                                      \
    v16i8 tmp_m;                                         \
                                                         \
    tmp_m = __msa_pckev_b((v16i8)in1, (v16i8)in0);       \
    ST_SB(tmp_m, (pdst));                                \
  }

/* Description : Horizontal 2 tap filter kernel code
   Arguments   : Inputs - in0, in1, mask, coeff, shift
   Details     : Shuffles bytes from 'in0'/'in1' per 'mask', computes an
                 unsigned dot product with 'coeff', then rounds and
                 arithmetic-shifts right by 'shift'.
*/
#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift)            \
  ({                                                                \
    v16i8 tmp0_m;                                                   \
    v8u16 tmp1_m;                                                   \
                                                                    \
    tmp0_m = __msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0);     \
    tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)coeff);           \
    tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift);            \
                                                                    \
    tmp1_m;                                                         \
  })
#endif  // VPX_VPX_DSP_MIPS_MACROS_MSA_H_