/*
 * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVUTIL_MIPS_GENERIC_MACROS_MSA_H
#define AVUTIL_MIPS_GENERIC_MACROS_MSA_H

#include <stdint.h>
#include <msa.h>
#include <config.h>

#define ALIGNMENT 16
#define ALLOC_ALIGNED(align) __attribute__ ((aligned((align) << 1)))

#define LD_V(RTYPE, psrc) *((RTYPE *)(psrc))
#define LD_UB(...) LD_V(v16u8, __VA_ARGS__)
#define LD_SB(...) LD_V(v16i8, __VA_ARGS__)
#define LD_UH(...) LD_V(v8u16, __VA_ARGS__)
#define LD_SH(...) LD_V(v8i16, __VA_ARGS__)
#define LD_UW(...) LD_V(v4u32, __VA_ARGS__)
#define LD_SW(...) LD_V(v4i32, __VA_ARGS__)

#define ST_V(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
#define ST_UB(...) ST_V(v16u8, __VA_ARGS__)
#define ST_SB(...) ST_V(v16i8, __VA_ARGS__)
#define ST_UH(...) ST_V(v8u16, __VA_ARGS__)
#define ST_SH(...) ST_V(v8i16, __VA_ARGS__)
#define ST_UW(...) ST_V(v4u32, __VA_ARGS__)
#define ST_SW(...) ST_V(v4i32, __VA_ARGS__)

#if (__mips_isa_rev >= 6)
#define LH(psrc) \
( { \
    uint16_t val_lh_m = *(uint16_t *)(psrc); \
    val_lh_m; \
} )

#define LW(psrc) \
( { \
    uint32_t val_lw_m = *(uint32_t *)(psrc); \
    val_lw_m; \
} )

#if (__mips == 64)
#define LD(psrc) \
( { \
    uint64_t val_ld_m = *(uint64_t *)(psrc); \
    val_ld_m; \
} )
#else // !(__mips == 64)
#define LD(psrc) \
( { \
    uint8_t *psrc_ld_m = (uint8_t *) (psrc); \
    uint32_t val0_ld_m, val1_ld_m; \
    uint64_t val_ld_m = 0; \
    \
    val0_ld_m = LW(psrc_ld_m); \
    val1_ld_m = LW(psrc_ld_m + 4); \
    \
    val_ld_m = (uint64_t) (val1_ld_m); \
    val_ld_m = (uint64_t) ((val_ld_m << 32) & 0xFFFFFFFF00000000); \
    val_ld_m = (uint64_t) (val_ld_m | (uint64_t) val0_ld_m); \
    \
    val_ld_m; \
} )
#endif // (__mips == 64)

#define SH(val, pdst) *(uint16_t *)(pdst) = (val);
#define SW(val, pdst) *(uint32_t *)(pdst) = (val);
#define SD(val, pdst) *(uint64_t *)(pdst) = (val);

#else // !(__mips_isa_rev >= 6)
#define LH(psrc) \
( { \
    uint8_t *psrc_lh_m = (uint8_t *) (psrc); \
    uint16_t val_lh_m; \
    \
    __asm__ volatile ( \
        "ulh %[val_lh_m], %[psrc_lh_m] \n\t" \
        \
        : [val_lh_m] "=r" (val_lh_m) \
        : [psrc_lh_m] "m" (*psrc_lh_m) \
    ); \
    \
    val_lh_m; \
} )

#define LW(psrc) \
( { \
    uint8_t *psrc_lw_m = (uint8_t *) (psrc); \
    uint32_t val_lw_m; \
    \
    __asm__ volatile ( \
        "lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t" \
        "lwl %[val_lw_m], 3(%[psrc_lw_m]) \n\t" \
        \
        : [val_lw_m] "=&r" (val_lw_m) \
        : [psrc_lw_m] "r" (psrc_lw_m) \
    ); \
    \
    val_lw_m; \
} )

#if (__mips == 64)
#define LD(psrc) \
( { \
    uint8_t *psrc_ld_m = (uint8_t *) (psrc); \
    uint64_t val_ld_m = 0; \
    \
    __asm__ volatile ( \
        "ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t" \
        "ldl %[val_ld_m], 7(%[psrc_ld_m]) \n\t" \
        \
        : [val_ld_m] "=&r" (val_ld_m) \
        : [psrc_ld_m] "r" (psrc_ld_m) \
    ); \
    \
    val_ld_m; \
} )
#else // !(__mips == 64)
#define LD(psrc) \
( { \
    uint8_t *psrc_ld_m = (uint8_t *) (psrc); \
    uint32_t val0_ld_m, val1_ld_m; \
    uint64_t val_ld_m = 0; \
    \
    val0_ld_m = LW(psrc_ld_m); \
    val1_ld_m = LW(psrc_ld_m + 4); \
    \
    val_ld_m = (uint64_t) (val1_ld_m); \
    val_ld_m = (uint64_t) ((val_ld_m << 32) & 0xFFFFFFFF00000000); \
    val_ld_m = (uint64_t) (val_ld_m | (uint64_t) val0_ld_m); \
    \
    val_ld_m; \
} )
#endif // (__mips == 64)

#define SH(val, pdst) \
{ \
    uint8_t *pdst_sh_m = (uint8_t *) (pdst); \
    uint16_t val_sh_m = (val); \
    \
    __asm__ volatile ( \
        "ush %[val_sh_m], %[pdst_sh_m] \n\t" \
        \
        : [pdst_sh_m] "=m" (*pdst_sh_m) \
        : [val_sh_m] "r" (val_sh_m) \
    ); \
}

#define SW(val, pdst) \
{ \
    uint8_t *pdst_sw_m = (uint8_t *) (pdst); \
    uint32_t val_sw_m = (val); \
    \
    __asm__ volatile ( \
        "usw %[val_sw_m], %[pdst_sw_m] \n\t" \
        \
        : [pdst_sw_m] "=m" (*pdst_sw_m) \
        : [val_sw_m] "r" (val_sw_m) \
    ); \
}

#define SD(val, pdst) \
{ \
    uint8_t *pdst_sd_m = (uint8_t *) (pdst); \
    uint32_t val0_sd_m, val1_sd_m; \
    \
    val0_sd_m = (uint32_t) ((val) & 0x00000000FFFFFFFF); \
    val1_sd_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF); \
    \
    SW(val0_sd_m, pdst_sd_m); \
    SW(val1_sd_m, pdst_sd_m + 4); \
}
#endif // (__mips_isa_rev >= 6)
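
/* Usage sketch (illustrative, not part of the original header): copying
 * eight bytes between possibly unaligned buffers with the scalar helpers
 * above. 'src' and 'dst' are hypothetical uint8_t pointers; pre-R6 cores
 * go through the lwr/lwl style sequences, R6 cores use plain loads/stores.
 *
 *     uint64_t tmp;
 *
 *     tmp = LD(src);
 *     SD(tmp, dst);
 */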

/* Description : Load 4 words with stride
   Arguments   : Inputs  - psrc (source pointer to load from)
                         - stride
                 Outputs - out0, out1, out2, out3
   Details     : Loads word in 'out0' from (psrc)
                 Loads word in 'out1' from (psrc + stride)
                 Loads word in 'out2' from (psrc + 2 * stride)
                 Loads word in 'out3' from (psrc + 3 * stride)
*/
#define LW4(psrc, stride, out0, out1, out2, out3) \
{ \
    out0 = LW((psrc)); \
    out1 = LW((psrc) + stride); \
    out2 = LW((psrc) + 2 * stride); \
    out3 = LW((psrc) + 3 * stride); \
}

#define LW2(psrc, stride, out0, out1) \
{ \
    out0 = LW((psrc)); \
    out1 = LW((psrc) + stride); \
}

/* Description : Load double words with stride
   Arguments   : Inputs  - psrc (source pointer to load from)
                         - stride
                 Outputs - out0, out1
   Details     : Loads double word in 'out0' from (psrc)
                 Loads double word in 'out1' from (psrc + stride)
*/
#define LD2(psrc, stride, out0, out1) \
{ \
    out0 = LD((psrc)); \
    out1 = LD((psrc) + stride); \
}
#define LD4(psrc, stride, out0, out1, out2, out3) \
{ \
    LD2((psrc), stride, out0, out1); \
    LD2((psrc) + 2 * stride, stride, out2, out3); \
}

/* Description : Store 4 words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Stores word from 'in0' to (pdst)
                 Stores word from 'in1' to (pdst + stride)
                 Stores word from 'in2' to (pdst + 2 * stride)
                 Stores word from 'in3' to (pdst + 3 * stride)
*/
#define SW4(in0, in1, in2, in3, pdst, stride) \
{ \
    SW(in0, (pdst)) \
    SW(in1, (pdst) + stride); \
    SW(in2, (pdst) + 2 * stride); \
    SW(in3, (pdst) + 3 * stride); \
}

/* Description : Store 4 double words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Stores double word from 'in0' to (pdst)
                 Stores double word from 'in1' to (pdst + stride)
                 Stores double word from 'in2' to (pdst + 2 * stride)
                 Stores double word from 'in3' to (pdst + 3 * stride)
*/
#define SD4(in0, in1, in2, in3, pdst, stride) \
{ \
    SD(in0, (pdst)) \
    SD(in1, (pdst) + stride); \
    SD(in2, (pdst) + 2 * stride); \
    SD(in3, (pdst) + 3 * stride); \
}
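
/* Usage sketch (illustrative, not part of the original header): copying an
 * 8x4 byte block row by row through general-purpose registers. 'src',
 * 'dst' and the strides are hypothetical.
 *
 *     uint64_t row0, row1, row2, row3;
 *
 *     LD4(src, src_stride, row0, row1, row2, row3);
 *     SD4(row0, row1, row2, row3, dst, dst_stride);
 */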

/* Description : Load vector elements with stride
   Arguments   : Inputs  - psrc (source pointer to load from)
                         - stride
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Loads elements in 'out0' from (psrc)
                 Loads elements in 'out1' from (psrc + stride)
*/
#define LD_V2(RTYPE, psrc, stride, out0, out1) \
{ \
    out0 = LD_V(RTYPE, (psrc)); \
    out1 = LD_V(RTYPE, (psrc) + stride); \
}
#define LD_UB2(...) LD_V2(v16u8, __VA_ARGS__)
#define LD_SB2(...) LD_V2(v16i8, __VA_ARGS__)
#define LD_UH2(...) LD_V2(v8u16, __VA_ARGS__)
#define LD_SH2(...) LD_V2(v8i16, __VA_ARGS__)
#define LD_SW2(...) LD_V2(v4i32, __VA_ARGS__)

#define LD_V3(RTYPE, psrc, stride, out0, out1, out2) \
{ \
    LD_V2(RTYPE, (psrc), stride, out0, out1); \
    out2 = LD_V(RTYPE, (psrc) + 2 * stride); \
}
#define LD_UB3(...) LD_V3(v16u8, __VA_ARGS__)
#define LD_SB3(...) LD_V3(v16i8, __VA_ARGS__)

#define LD_V4(RTYPE, psrc, stride, out0, out1, out2, out3) \
{ \
    LD_V2(RTYPE, (psrc), stride, out0, out1); \
    LD_V2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \
}
#define LD_UB4(...) LD_V4(v16u8, __VA_ARGS__)
#define LD_SB4(...) LD_V4(v16i8, __VA_ARGS__)
#define LD_UH4(...) LD_V4(v8u16, __VA_ARGS__)
#define LD_SH4(...) LD_V4(v8i16, __VA_ARGS__)
#define LD_SW4(...) LD_V4(v4i32, __VA_ARGS__)

#define LD_V5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \
{ \
    LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
    out4 = LD_V(RTYPE, (psrc) + 4 * stride); \
}
#define LD_UB5(...) LD_V5(v16u8, __VA_ARGS__)
#define LD_SB5(...) LD_V5(v16i8, __VA_ARGS__)

#define LD_V6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5) \
{ \
    LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
    LD_V2(RTYPE, (psrc) + 4 * stride, stride, out4, out5); \
}
#define LD_UB6(...) LD_V6(v16u8, __VA_ARGS__)
#define LD_SB6(...) LD_V6(v16i8, __VA_ARGS__)
#define LD_UH6(...) LD_V6(v8u16, __VA_ARGS__)
#define LD_SH6(...) LD_V6(v8i16, __VA_ARGS__)

#define LD_V7(RTYPE, psrc, stride, \
              out0, out1, out2, out3, out4, out5, out6) \
{ \
    LD_V5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \
    LD_V2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \
}
#define LD_UB7(...) LD_V7(v16u8, __VA_ARGS__)
#define LD_SB7(...) LD_V7(v16i8, __VA_ARGS__)

#define LD_V8(RTYPE, psrc, stride, \
              out0, out1, out2, out3, out4, out5, out6, out7) \
{ \
    LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
    LD_V4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \
}
#define LD_UB8(...) LD_V8(v16u8, __VA_ARGS__)
#define LD_SB8(...) LD_V8(v16i8, __VA_ARGS__)
#define LD_UH8(...) LD_V8(v8u16, __VA_ARGS__)
#define LD_SH8(...) LD_V8(v8i16, __VA_ARGS__)
#define LD_SW8(...) LD_V8(v4i32, __VA_ARGS__)

#define LD_V16(RTYPE, psrc, stride, \
               out0, out1, out2, out3, out4, out5, out6, out7, \
               out8, out9, out10, out11, out12, out13, out14, out15) \
{ \
    LD_V8(RTYPE, (psrc), stride, \
          out0, out1, out2, out3, out4, out5, out6, out7); \
    LD_V8(RTYPE, (psrc) + 8 * stride, stride, \
          out8, out9, out10, out11, out12, out13, out14, out15); \
}
#define LD_SH16(...) LD_V16(v8i16, __VA_ARGS__)

/* Description : Store vectors with stride
   Arguments   : Inputs  - in0, in1, stride
                 Outputs - pdst (destination pointer to store to)
   Details     : Stores elements from 'in0' to (pdst)
                 Stores elements from 'in1' to (pdst + stride)
*/
#define ST_V2(RTYPE, in0, in1, pdst, stride) \
{ \
    ST_V(RTYPE, in0, (pdst)); \
    ST_V(RTYPE, in1, (pdst) + stride); \
}
#define ST_UB2(...) ST_V2(v16u8, __VA_ARGS__)
#define ST_SB2(...) ST_V2(v16i8, __VA_ARGS__)
#define ST_UH2(...) ST_V2(v8u16, __VA_ARGS__)
#define ST_SH2(...) ST_V2(v8i16, __VA_ARGS__)
#define ST_SW2(...) ST_V2(v4i32, __VA_ARGS__)

#define ST_V4(RTYPE, in0, in1, in2, in3, pdst, stride) \
{ \
    ST_V2(RTYPE, in0, in1, (pdst), stride); \
    ST_V2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
}
#define ST_UB4(...) ST_V4(v16u8, __VA_ARGS__)
#define ST_SB4(...) ST_V4(v16i8, __VA_ARGS__)
#define ST_SH4(...) ST_V4(v8i16, __VA_ARGS__)
#define ST_SW4(...) ST_V4(v4i32, __VA_ARGS__)

#define ST_V6(RTYPE, in0, in1, in2, in3, in4, in5, pdst, stride) \
{ \
    ST_V4(RTYPE, in0, in1, in2, in3, (pdst), stride); \
    ST_V2(RTYPE, in4, in5, (pdst) + 4 * stride, stride); \
}
#define ST_SH6(...) ST_V6(v8i16, __VA_ARGS__)

#define ST_V8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
{ \
    ST_V4(RTYPE, in0, in1, in2, in3, (pdst), stride); \
    ST_V4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \
}
#define ST_UB8(...) ST_V8(v16u8, __VA_ARGS__)
#define ST_SH8(...) ST_V8(v8i16, __VA_ARGS__)
#define ST_SW8(...) ST_V8(v4i32, __VA_ARGS__)
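
/* Usage sketch (illustrative, not part of the original header): moving a
 * 16x8 byte block through vector registers with the strided load/store
 * families above. 'src', 'dst' and the strides are hypothetical; pointers
 * are assumed suitably aligned for LD_V/ST_V.
 *
 *     v16u8 r0, r1, r2, r3, r4, r5, r6, r7;
 *
 *     LD_UB8(src, src_stride, r0, r1, r2, r3, r4, r5, r6, r7);
 *     ST_UB8(r0, r1, r2, r3, r4, r5, r6, r7, dst, dst_stride);
 */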

/* Description : Store half word elements of vector with stride
 * Arguments   : Inputs - in (source vector)
 *                      - pdst (destination pointer to store to)
 *                      - stride
 * Details     : Stores half word 'idx0' from 'in' to (pdst)
 *               Stores half word 'idx1' from 'in' to (pdst + stride)
 *               Similar for other elements
 */
#define ST_H1(in, idx, pdst) \
{ \
    uint16_t out0_m; \
    out0_m = __msa_copy_u_h((v8i16) in, idx); \
    SH(out0_m, (pdst)); \
}
#define ST_H2(in, idx0, idx1, pdst, stride) \
{ \
    uint16_t out0_m, out1_m; \
    out0_m = __msa_copy_u_h((v8i16) in, idx0); \
    out1_m = __msa_copy_u_h((v8i16) in, idx1); \
    SH(out0_m, (pdst)); \
    SH(out1_m, (pdst) + stride); \
}
#define ST_H4(in, idx0, idx1, idx2, idx3, pdst, stride) \
{ \
    uint16_t out0_m, out1_m, out2_m, out3_m; \
    out0_m = __msa_copy_u_h((v8i16) in, idx0); \
    out1_m = __msa_copy_u_h((v8i16) in, idx1); \
    out2_m = __msa_copy_u_h((v8i16) in, idx2); \
    out3_m = __msa_copy_u_h((v8i16) in, idx3); \
    SH(out0_m, (pdst)); \
    SH(out1_m, (pdst) + stride); \
    SH(out2_m, (pdst) + 2 * stride); \
    SH(out3_m, (pdst) + 3 * stride); \
}
#define ST_H8(in, idx0, idx1, idx2, idx3, idx4, idx5, \
              idx6, idx7, pdst, stride) \
{ \
    ST_H4(in, idx0, idx1, idx2, idx3, pdst, stride) \
    ST_H4(in, idx4, idx5, idx6, idx7, (pdst) + 4 * stride, stride) \
}

/* Description : Store word elements of vector with stride
 * Arguments   : Inputs - in (source vector)
 *                      - pdst (destination pointer to store to)
 *                      - stride
 * Details     : Stores word 'idx0' from 'in' to (pdst)
 *               Stores word 'idx1' from 'in' to (pdst + stride)
 *               Similar for other elements
 */
#define ST_W1(in, idx, pdst) \
{ \
    uint32_t out0_m; \
    out0_m = __msa_copy_u_w((v4i32) in, idx); \
    SW(out0_m, (pdst)); \
}
#define ST_W2(in, idx0, idx1, pdst, stride) \
{ \
    uint32_t out0_m, out1_m; \
    out0_m = __msa_copy_u_w((v4i32) in, idx0); \
    out1_m = __msa_copy_u_w((v4i32) in, idx1); \
    SW(out0_m, (pdst)); \
    SW(out1_m, (pdst) + stride); \
}
#define ST_W4(in, idx0, idx1, idx2, idx3, pdst, stride) \
{ \
    uint32_t out0_m, out1_m, out2_m, out3_m; \
    out0_m = __msa_copy_u_w((v4i32) in, idx0); \
    out1_m = __msa_copy_u_w((v4i32) in, idx1); \
    out2_m = __msa_copy_u_w((v4i32) in, idx2); \
    out3_m = __msa_copy_u_w((v4i32) in, idx3); \
    SW(out0_m, (pdst)); \
    SW(out1_m, (pdst) + stride); \
    SW(out2_m, (pdst) + 2 * stride); \
    SW(out3_m, (pdst) + 3 * stride); \
}
#define ST_W8(in0, in1, idx0, idx1, idx2, idx3, \
              idx4, idx5, idx6, idx7, pdst, stride) \
{ \
    ST_W4(in0, idx0, idx1, idx2, idx3, pdst, stride) \
    ST_W4(in1, idx4, idx5, idx6, idx7, pdst + 4 * stride, stride) \
}
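
/* Usage sketch (illustrative, not part of the original header): scattering
 * the four word elements of one result vector to four rows of a 4x4 byte
 * block. 'res', 'dst' and 'dst_stride' are hypothetical.
 *
 *     v16u8 res;
 *
 *     ST_W4(res, 0, 1, 2, 3, dst, dst_stride);
 */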

/* Description : Store double word elements of vector with stride
 * Arguments   : Inputs - in (source vector)
 *                      - pdst (destination pointer to store to)
 *                      - stride
 * Details     : Stores double word 'idx0' from 'in' to (pdst)
 *               Stores double word 'idx1' from 'in' to (pdst + stride)
 *               Similar for other elements
 */
#define ST_D1(in, idx, pdst) \
{ \
    uint64_t out0_m; \
    out0_m = __msa_copy_u_d((v2i64) in, idx); \
    SD(out0_m, (pdst)); \
}
#define ST_D2(in, idx0, idx1, pdst, stride) \
{ \
    uint64_t out0_m, out1_m; \
    out0_m = __msa_copy_u_d((v2i64) in, idx0); \
    out1_m = __msa_copy_u_d((v2i64) in, idx1); \
    SD(out0_m, (pdst)); \
    SD(out1_m, (pdst) + stride); \
}
#define ST_D4(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \
{ \
    uint64_t out0_m, out1_m, out2_m, out3_m; \
    out0_m = __msa_copy_u_d((v2i64) in0, idx0); \
    out1_m = __msa_copy_u_d((v2i64) in0, idx1); \
    out2_m = __msa_copy_u_d((v2i64) in1, idx2); \
    out3_m = __msa_copy_u_d((v2i64) in1, idx3); \
    SD(out0_m, (pdst)); \
    SD(out1_m, (pdst) + stride); \
    SD(out2_m, (pdst) + 2 * stride); \
    SD(out3_m, (pdst) + 3 * stride); \
}
#define ST_D8(in0, in1, in2, in3, idx0, idx1, idx2, idx3, \
              idx4, idx5, idx6, idx7, pdst, stride) \
{ \
    ST_D4(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \
    ST_D4(in2, in3, idx4, idx5, idx6, idx7, pdst + 4 * stride, stride) \
}

/* Description : Store as 12x8 byte block to destination memory from
                 input vectors
   Arguments   : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : The index 0 double word element from input vector 'in0' is
                 copied and stored to destination memory at (pblk_12x8_m),
                 followed by the index 2 word element from the same input
                 vector 'in0' at (pblk_12x8_m + 8)
                 The same pattern is applied to the remaining rows
*/
#define ST12x8_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
{ \
    uint64_t out0_m, out1_m, out2_m, out3_m; \
    uint64_t out4_m, out5_m, out6_m, out7_m; \
    uint32_t out8_m, out9_m, out10_m, out11_m; \
    uint32_t out12_m, out13_m, out14_m, out15_m; \
    uint8_t *pblk_12x8_m = (uint8_t *) (pdst); \
    \
    out0_m = __msa_copy_u_d((v2i64) in0, 0); \
    out1_m = __msa_copy_u_d((v2i64) in1, 0); \
    out2_m = __msa_copy_u_d((v2i64) in2, 0); \
    out3_m = __msa_copy_u_d((v2i64) in3, 0); \
    out4_m = __msa_copy_u_d((v2i64) in4, 0); \
    out5_m = __msa_copy_u_d((v2i64) in5, 0); \
    out6_m = __msa_copy_u_d((v2i64) in6, 0); \
    out7_m = __msa_copy_u_d((v2i64) in7, 0); \
    \
    out8_m = __msa_copy_u_w((v4i32) in0, 2); \
    out9_m = __msa_copy_u_w((v4i32) in1, 2); \
    out10_m = __msa_copy_u_w((v4i32) in2, 2); \
    out11_m = __msa_copy_u_w((v4i32) in3, 2); \
    out12_m = __msa_copy_u_w((v4i32) in4, 2); \
    out13_m = __msa_copy_u_w((v4i32) in5, 2); \
    out14_m = __msa_copy_u_w((v4i32) in6, 2); \
    out15_m = __msa_copy_u_w((v4i32) in7, 2); \
    \
    SD(out0_m, pblk_12x8_m); \
    SW(out8_m, pblk_12x8_m + 8); \
    pblk_12x8_m += stride; \
    SD(out1_m, pblk_12x8_m); \
    SW(out9_m, pblk_12x8_m + 8); \
    pblk_12x8_m += stride; \
    SD(out2_m, pblk_12x8_m); \
    SW(out10_m, pblk_12x8_m + 8); \
    pblk_12x8_m += stride; \
    SD(out3_m, pblk_12x8_m); \
    SW(out11_m, pblk_12x8_m + 8); \
    pblk_12x8_m += stride; \
    SD(out4_m, pblk_12x8_m); \
    SW(out12_m, pblk_12x8_m + 8); \
    pblk_12x8_m += stride; \
    SD(out5_m, pblk_12x8_m); \
    SW(out13_m, pblk_12x8_m + 8); \
    pblk_12x8_m += stride; \
    SD(out6_m, pblk_12x8_m); \
    SW(out14_m, pblk_12x8_m + 8); \
    pblk_12x8_m += stride; \
    SD(out7_m, pblk_12x8_m); \
    SW(out15_m, pblk_12x8_m + 8); \
}
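
/* Usage sketch (illustrative, not part of the original header): writing an
 * 8x2 byte block from the two double word elements of a single vector with
 * ST_D2 above. 'res', 'dst' and 'dst_stride' are hypothetical.
 *
 *     v16u8 res;
 *
 *     ST_D2(res, 0, 1, dst, dst_stride);
 */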

/* Description : Average with rounding, (in0 + in1 + 1) / 2
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each byte element from the 'in0' vector is added to the
                 corresponding byte element from the 'in1' vector. The
                 addition of the elements plus 1 (for rounding) is done
                 unsigned with full precision, i.e. the result has one extra
                 bit. Unsigned division by 2 (or logical shift right by one
                 bit) is performed before writing the result to vector 'out0'
                 Similar for the pair of 'in2' and 'in3'
*/
#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \
{ \
    out0 = (RTYPE) __msa_aver_u_b((v16u8) in0, (v16u8) in1); \
    out1 = (RTYPE) __msa_aver_u_b((v16u8) in2, (v16u8) in3); \
}
#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)

#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                 out0, out1, out2, out3) \
{ \
    AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \
    AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3) \
}
#define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)
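
/* Usage sketch (illustrative, not part of the original header): rounding
 * average of two rows into existing destination rows, as used for
 * bi-directional prediction. All names are hypothetical; pointers are
 * assumed 16-byte aligned.
 *
 *     v16u8 src0, src1, dst0, dst1;
 *
 *     LD_UB2(src, stride, src0, src1);
 *     LD_UB2(dst, stride, dst0, dst1);
 *     AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
 *     ST_UB2(dst0, dst1, dst, stride);
 */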

/* Description : Immediate number of columns to slide
   Arguments   : Inputs  - s, d, slide_val
                 Outputs - out
                 Return Type - as per RTYPE
   Details     : Byte elements from the 'd' vector are slid into 's' by the
                 number of elements specified by 'slide_val'
*/
#define SLDI_B(RTYPE, d, s, slide_val, out) \
{ \
    out = (RTYPE) __msa_sldi_b((v16i8) d, (v16i8) s, slide_val); \
}

#define SLDI_B2(RTYPE, d0, s0, d1, s1, slide_val, out0, out1) \
{ \
    SLDI_B(RTYPE, d0, s0, slide_val, out0) \
    SLDI_B(RTYPE, d1, s1, slide_val, out1) \
}
#define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__)
#define SLDI_B2_SB(...) SLDI_B2(v16i8, __VA_ARGS__)
#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)
#define SLDI_B2_SW(...) SLDI_B2(v4i32, __VA_ARGS__)

#define SLDI_B3(RTYPE, d0, s0, d1, s1, d2, s2, slide_val, \
                out0, out1, out2) \
{ \
    SLDI_B2(RTYPE, d0, s0, d1, s1, slide_val, out0, out1) \
    SLDI_B(RTYPE, d2, s2, slide_val, out2) \
}
#define SLDI_B3_UB(...) SLDI_B3(v16u8, __VA_ARGS__)
#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__)
#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)

#define SLDI_B4(RTYPE, d0, s0, d1, s1, d2, s2, d3, s3, \
                slide_val, out0, out1, out2, out3) \
{ \
    SLDI_B2(RTYPE, d0, s0, d1, s1, slide_val, out0, out1) \
    SLDI_B2(RTYPE, d2, s2, d3, s3, slide_val, out2, out3) \
}
#define SLDI_B4_UB(...) SLDI_B4(v16u8, __VA_ARGS__)
#define SLDI_B4_SB(...) SLDI_B4(v16i8, __VA_ARGS__)
#define SLDI_B4_SH(...) SLDI_B4(v8i16, __VA_ARGS__)

/* Description : Shuffle byte vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Selective byte elements from in0 & in1 are copied to out0 as
                 per control vector mask0
                 Selective byte elements from in2 & in3 are copied to out1 as
                 per control vector mask1
*/
#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_vshf_b((v16i8) mask0, (v16i8) in1, (v16i8) in0); \
    out1 = (RTYPE) __msa_vshf_b((v16i8) mask1, (v16i8) in3, (v16i8) in2); \
}
#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
#define VSHF_B2_SH(...) VSHF_B2(v8i16, __VA_ARGS__)

#define VSHF_B3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2, \
                out0, out1, out2) \
{ \
    VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1); \
    out2 = (RTYPE) __msa_vshf_b((v16i8) mask2, (v16i8) in5, (v16i8) in4); \
}
#define VSHF_B3_SB(...) VSHF_B3(v16i8, __VA_ARGS__)

#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, \
                out0, out1, out2, out3) \
{ \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1); \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3); \
}
#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__)

/* Description : Shuffle halfword vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Selective halfword elements from in0 & in1 are copied to out0
                 as per control vector mask0
                 Selective halfword elements from in2 & in3 are copied to out1
                 as per control vector mask1
*/
#define VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_vshf_h((v8i16) mask0, (v8i16) in1, (v8i16) in0); \
    out1 = (RTYPE) __msa_vshf_h((v8i16) mask1, (v8i16) in3, (v8i16) in2); \
}
#define VSHF_H2_SH(...) VSHF_H2(v8i16, __VA_ARGS__)

#define VSHF_H3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2, \
                out0, out1, out2) \
{ \
    VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1); \
    out2 = (RTYPE) __msa_vshf_h((v8i16) mask2, (v8i16) in5, (v8i16) in4); \
}
#define VSHF_H3_SH(...) VSHF_H3(v8i16, __VA_ARGS__)

/* Description : Shuffle word vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Selective word elements from in0 & in1 are copied to out0 as
                 per control vector mask0
                 Selective word elements from in2 & in3 are copied to out1 as
                 per control vector mask1
*/
#define VSHF_W2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_vshf_w((v4i32) mask0, (v4i32) in1, (v4i32) in0); \
    out1 = (RTYPE) __msa_vshf_w((v4i32) mask1, (v4i32) in3, (v4i32) in2); \
}
#define VSHF_W2_SB(...) VSHF_W2(v16i8, __VA_ARGS__)

/* Description : Dot product of byte vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Unsigned byte elements from mult0 are multiplied with
                 unsigned byte elements from cnst0 producing a result
                 twice the size of the input, i.e. unsigned halfword.
                 The products of adjacent odd-even element pairs are then
                 added together and stored to the out vector
                 (8 unsigned halfword results)
*/
#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_dotp_u_h((v16u8) mult0, (v16u8) cnst0); \
    out1 = (RTYPE) __msa_dotp_u_h((v16u8) mult1, (v16u8) cnst1); \
}
#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)

#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, \
                 cnst0, cnst1, cnst2, cnst3, \
                 out0, out1, out2, out3) \
{ \
    DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
    DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
}
#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)
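
/* Usage sketch (illustrative, not part of the original header): one
 * multiply-accumulate step of a horizontal filter. 'src0'/'src1' hold
 * shuffled source bytes and 'coeff_vec' holds replicated filter taps;
 * all names are hypothetical.
 *
 *     v16u8 src0, src1, coeff_vec;
 *     v8u16 res0, res1;
 *
 *     DOTP_UB2_UH(src0, src1, coeff_vec, coeff_vec, res0, res1);
 */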

/* Description : Dot product of byte vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed byte elements from mult0 are multiplied with
                 signed byte elements from cnst0 producing a result
                 twice the size of the input, i.e. signed halfword.
                 The products of adjacent odd-even element pairs are then
                 added together and stored to the out vector
                 (8 signed halfword results)
*/
#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_dotp_s_h((v16i8) mult0, (v16i8) cnst0); \
    out1 = (RTYPE) __msa_dotp_s_h((v16i8) mult1, (v16i8) cnst1); \
}
#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)

#define DOTP_SB3(RTYPE, mult0, mult1, mult2, cnst0, cnst1, cnst2, \
                 out0, out1, out2) \
{ \
    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
    out2 = (RTYPE) __msa_dotp_s_h((v16i8) mult2, (v16i8) cnst2); \
}
#define DOTP_SB3_SH(...) DOTP_SB3(v8i16, __VA_ARGS__)

#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, \
                 cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) \
{ \
    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
    DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
}
#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)

/* Description : Dot product of halfword vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from mult0 are multiplied with
                 signed halfword elements from cnst0 producing a result
                 twice the size of the input, i.e. signed word.
                 The products of adjacent odd-even element pairs are then
                 added together and stored to the out vector
                 (4 signed word results)
*/
#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_dotp_s_w((v8i16) mult0, (v8i16) cnst0); \
    out1 = (RTYPE) __msa_dotp_s_w((v8i16) mult1, (v8i16) cnst1); \
}
#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)

#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, \
                 cnst0, cnst1, cnst2, cnst3, \
                 out0, out1, out2, out3) \
{ \
    DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
    DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
}
#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)

/* Description : Dot product & addition of byte vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed byte elements from mult0 are multiplied with
                 signed byte elements from cnst0 producing a result
                 twice the size of the input, i.e. signed halfword.
                 The products of adjacent odd-even element pairs are then
                 added to the out vector
                 (8 signed halfword results)
*/
#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_dpadd_s_h((v8i16) out0, \
                                   (v16i8) mult0, (v16i8) cnst0); \
    out1 = (RTYPE) __msa_dpadd_s_h((v8i16) out1, \
                                   (v16i8) mult1, (v16i8) cnst1); \
}
#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)

#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, \
                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) \
{ \
    DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
    DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
}
#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
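
/* Usage sketch (illustrative, not part of the original header): starting a
 * dot product with DOTP and folding further taps into the running sums
 * with DPADD, e.g. for a multi-tap filter. All names are hypothetical.
 *
 *     v16i8 vec0, vec1, vec2, vec3, filt0, filt1;
 *     v8i16 sum0, sum1;
 *
 *     DOTP_SB2_SH(vec0, vec1, filt0, filt0, sum0, sum1);
 *     DPADD_SB2_SH(vec2, vec3, filt1, filt1, sum0, sum1);
 */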

/* Description : Dot product & addition of byte vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Unsigned byte elements from mult0 are multiplied with
                 unsigned byte elements from cnst0 producing a result
                 twice the size of the input, i.e. unsigned halfword.
                 The products of adjacent odd-even element pairs are then
                 added to the out vector
                 (8 unsigned halfword results)
*/
#define DPADD_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_dpadd_u_h((v8u16) out0, \
                                   (v16u8) mult0, (v16u8) cnst0); \
    out1 = (RTYPE) __msa_dpadd_u_h((v8u16) out1, \
                                   (v16u8) mult1, (v16u8) cnst1); \
}
#define DPADD_UB2_UH(...) DPADD_UB2(v8u16, __VA_ARGS__)

/* Description : Dot product & addition of halfword vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from mult0 are multiplied with
                 signed halfword elements from cnst0 producing a result
                 twice the size of the input, i.e. signed word.
                 The products of adjacent odd-even element pairs are then
                 added to the out vector
                 (4 signed word results)
*/
#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_dpadd_s_w((v4i32) out0, \
                                   (v8i16) mult0, (v8i16) cnst0); \
    out1 = (RTYPE) __msa_dpadd_s_w((v4i32) out1, \
                                   (v8i16) mult1, (v8i16) cnst1); \
}
#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)

#define DPADD_SH4(RTYPE, mult0, mult1, mult2, mult3, \
                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) \
{ \
    DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
    DPADD_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
}
#define DPADD_SH4_SW(...) DPADD_SH4(v4i32, __VA_ARGS__)

/* Description : Minimum values between unsigned elements of
                 either vector are copied to the output vector
   Arguments   : Inputs  - in0, in1, min_vec
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Minimum of unsigned halfword element values from 'in0' and
                 'min_vec' are written to the output vector 'in0'
*/
#define MIN_UH2(RTYPE, in0, in1, min_vec) \
{ \
    in0 = (RTYPE) __msa_min_u_h((v8u16) in0, min_vec); \
    in1 = (RTYPE) __msa_min_u_h((v8u16) in1, min_vec); \
}
#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__)

#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec) \
{ \
    MIN_UH2(RTYPE, in0, in1, min_vec); \
    MIN_UH2(RTYPE, in2, in3, min_vec); \
}
#define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__)

/* Description : Clips all halfword elements of input vector between min & max
                 out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in))
   Arguments   : Inputs  - in (input vector)
                         - min (min threshold)
                         - max (max threshold)
                 Outputs - in (output vector with clipped elements)
                 Return Type - signed halfword
*/
#define CLIP_SH(in, min, max) \
{ \
    in = __msa_max_s_h((v8i16) min, (v8i16) in); \
    in = __msa_min_s_h((v8i16) max, (v8i16) in); \
}

/* Description : Clips all signed halfword elements of input vector
                 between 0 & 255
   Arguments   : Inputs  - in (input vector)
                 Outputs - in (output vector with clipped elements)
                 Return Type - signed halfword
*/
#define CLIP_SH_0_255(in) \
{ \
    in = __msa_maxi_s_h((v8i16) in, 0); \
    in = (v8i16) __msa_sat_u_h((v8u16) in, 7); \
}

#define CLIP_SH2_0_255(in0, in1) \
{ \
    CLIP_SH_0_255(in0); \
    CLIP_SH_0_255(in1); \
}

#define CLIP_SH4_0_255(in0, in1, in2, in3) \
{ \
    CLIP_SH2_0_255(in0, in1); \
    CLIP_SH2_0_255(in2, in3); \
}

#define CLIP_SH8_0_255(in0, in1, in2, in3, \
                       in4, in5, in6, in7) \
{ \
    CLIP_SH4_0_255(in0, in1, in2, in3); \
    CLIP_SH4_0_255(in4, in5, in6, in7); \
}

/* Description : Clips all signed word elements of input vector
                 between 0 & 255
   Arguments   : Inputs  - in (input vector)
                 Outputs - in (output vector with clipped elements)
                 Return Type - signed word
*/
#define CLIP_SW_0_255(in) \
{ \
    in = __msa_maxi_s_w((v4i32) in, 0); \
    in = (v4i32) __msa_sat_u_w((v4u32) in, 7); \
}

#define CLIP_SW2_0_255(in0, in1) \
{ \
    CLIP_SW_0_255(in0); \
    CLIP_SW_0_255(in1); \
}

#define CLIP_SW4_0_255(in0, in1, in2, in3) \
{ \
    CLIP_SW2_0_255(in0, in1); \
    CLIP_SW2_0_255(in2, in3); \
}

#define CLIP_SW8_0_255(in0, in1, in2, in3, \
                       in4, in5, in6, in7) \
{ \
    CLIP_SW4_0_255(in0, in1, in2, in3); \
    CLIP_SW4_0_255(in4, in5, in6, in7); \
}
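
/* Usage sketch (illustrative, not part of the original header): clamping
 * halfword filter results to the 8-bit pixel range before they are packed
 * to bytes. 'res0' and 'res1' are hypothetical.
 *
 *     v8i16 res0, res1;
 *
 *     CLIP_SH2_0_255(res0, res1);
 */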

/* Description : Addition of 4 signed word elements
                 The 4 signed word elements of the input vector are added
                 together and the resulting integer sum is returned
   Arguments   : Inputs  - in (signed word vector)
                 Outputs - sum_m (i32 sum)
                 Return Type - signed word
*/
#define HADD_SW_S32(in) \
( { \
    v2i64 res0_m, res1_m; \
    int32_t sum_m; \
    \
    res0_m = __msa_hadd_s_d((v4i32) in, (v4i32) in); \
    res1_m = __msa_splati_d(res0_m, 1); \
    res0_m += res1_m; \
    sum_m = __msa_copy_s_w((v4i32) res0_m, 0); \
    sum_m; \
} )

/* Description : Addition of 8 unsigned halfword elements
                 The 8 unsigned halfword elements of the input vector are
                 added together and the resulting integer sum is returned
   Arguments   : Inputs  - in (unsigned halfword vector)
                 Outputs - sum_m (u32 sum)
                 Return Type - unsigned word
*/
#define HADD_UH_U32(in) \
( { \
    v4u32 res_m; \
    v2u64 res0_m, res1_m; \
    uint32_t sum_m; \
    \
    res_m = __msa_hadd_u_w((v8u16) in, (v8u16) in); \
    res0_m = __msa_hadd_u_d(res_m, res_m); \
    res1_m = (v2u64) __msa_splati_d((v2i64) res0_m, 1); \
    res0_m += res1_m; \
    sum_m = __msa_copy_u_w((v4i32) res0_m, 0); \
    sum_m; \
} )

/* Description : Horizontal addition of signed byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each signed odd byte element from 'in0' is added to the
                 adjacent even signed byte element from 'in0' (pairwise) and
                 the halfword result is written to 'out0'
*/
#define HADD_SB2(RTYPE, in0, in1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_hadd_s_h((v16i8) in0, (v16i8) in0); \
    out1 = (RTYPE) __msa_hadd_s_h((v16i8) in1, (v16i8) in1); \
}
#define HADD_SB2_SH(...) HADD_SB2(v8i16, __VA_ARGS__)

#define HADD_SB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \
{ \
    HADD_SB2(RTYPE, in0, in1, out0, out1); \
    HADD_SB2(RTYPE, in2, in3, out2, out3); \
}
#define HADD_SB4_UH(...) HADD_SB4(v8u16, __VA_ARGS__)
#define HADD_SB4_SH(...) HADD_SB4(v8i16, __VA_ARGS__)

/* Description : Horizontal addition of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each unsigned odd byte element from 'in0' is added to the
                 adjacent even unsigned byte element from 'in0' (pairwise) and
                 the halfword result is written to 'out0'
*/
#define HADD_UB2(RTYPE, in0, in1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_hadd_u_h((v16u8) in0, (v16u8) in0); \
    out1 = (RTYPE) __msa_hadd_u_h((v16u8) in1, (v16u8) in1); \
}
#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)

#define HADD_UB3(RTYPE, in0, in1, in2, out0, out1, out2) \
{ \
    HADD_UB2(RTYPE, in0, in1, out0, out1); \
    out2 = (RTYPE) __msa_hadd_u_h((v16u8) in2, (v16u8) in2); \
}
#define HADD_UB3_UH(...) HADD_UB3(v8u16, __VA_ARGS__)

#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \
{ \
    HADD_UB2(RTYPE, in0, in1, out0, out1); \
    HADD_UB2(RTYPE, in2, in3, out2, out3); \
}
#define HADD_UB4_UB(...) HADD_UB4(v16u8, __VA_ARGS__)
#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)
#define HADD_UB4_SH(...) HADD_UB4(v8i16, __VA_ARGS__)

/* Description : Horizontal subtraction of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each unsigned odd byte element from 'in0' is subtracted from
                 the adjacent even unsigned byte element from 'in0' (pairwise)
                 and the halfword result is written to 'out0'
*/
#define HSUB_UB2(RTYPE, in0, in1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_hsub_u_h((v16u8) in0, (v16u8) in0); \
    out1 = (RTYPE) __msa_hsub_u_h((v16u8) in1, (v16u8) in1); \
}
#define HSUB_UB2_UH(...) HSUB_UB2(v8u16, __VA_ARGS__)
#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)

#define HSUB_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \
{ \
    HSUB_UB2(RTYPE, in0, in1, out0, out1); \
    HSUB_UB2(RTYPE, in2, in3, out2, out3); \
}
#define HSUB_UB4_UH(...) HSUB_UB4(v8u16, __VA_ARGS__)
#define HSUB_UB4_SH(...) HSUB_UB4(v8i16, __VA_ARGS__)

/* Description : SAD (Sum of Absolute Difference)
   Arguments   : Inputs  - in0, in1, ref0, ref1 (unsigned byte src & ref)
                 Outputs - sad_m (halfword vector with sad)
                 Return Type - unsigned halfword
   Details     : The absolute difference of all byte elements from 'in0' with
                 'ref0' is calculated and preserved in 'diff0'. From the 16
                 unsigned absolute diff values, even-odd pairs are added
                 together to generate 8 halfword results.
*/
#define SAD_UB2_UH(in0, in1, ref0, ref1) \
( { \
    v16u8 diff0_m, diff1_m; \
    v8u16 sad_m = { 0 }; \
    \
    diff0_m = __msa_asub_u_b((v16u8) in0, (v16u8) ref0); \
    diff1_m = __msa_asub_u_b((v16u8) in1, (v16u8) ref1); \
    \
    sad_m += __msa_hadd_u_h((v16u8) diff0_m, (v16u8) diff0_m); \
    sad_m += __msa_hadd_u_h((v16u8) diff1_m, (v16u8) diff1_m); \
    \
    sad_m; \
} )
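
/* Usage sketch (illustrative, not part of the original header): a 16x2 SAD
 * reduced to a scalar with the horizontal-add helper above. All names are
 * hypothetical.
 *
 *     v16u8 src0, src1, ref0, ref1;
 *     v8u16 sad;
 *     uint32_t sum;
 *
 *     sad = SAD_UB2_UH(src0, src1, ref0, ref1);
 *     sum = HADD_UH_U32(sad);
 */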

/* Description : Insert specified word elements from input registers into one
                 destination vector
   Arguments   : Inputs  - in0, in1, in2, in3 (4 word elements)
                 Outputs - out (output vector)
                 Return Type - as per RTYPE
*/
#define INSERT_W2(RTYPE, in0, in1, out) \
{ \
    out = (RTYPE) __msa_insert_w((v4i32) out, 0, in0); \
    out = (RTYPE) __msa_insert_w((v4i32) out, 1, in1); \
}
#define INSERT_W2_UB(...) INSERT_W2(v16u8, __VA_ARGS__)
#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)

#define INSERT_W4(RTYPE, in0, in1, in2, in3, out) \
{ \
    out = (RTYPE) __msa_insert_w((v4i32) out, 0, in0); \
    out = (RTYPE) __msa_insert_w((v4i32) out, 1, in1); \
    out = (RTYPE) __msa_insert_w((v4i32) out, 2, in2); \
    out = (RTYPE) __msa_insert_w((v4i32) out, 3, in3); \
}
#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
#define INSERT_W4_SH(...) INSERT_W4(v8i16, __VA_ARGS__)
#define INSERT_W4_SW(...) INSERT_W4(v4i32, __VA_ARGS__)

/* Description : Insert specified double word elements from input registers
                 into one destination vector
   Arguments   : Inputs  - in0, in1 (2 double word elements)
                 Outputs - out (output vector)
                 Return Type - as per RTYPE
*/
#define INSERT_D2(RTYPE, in0, in1, out) \
{ \
    out = (RTYPE) __msa_insert_d((v2i64) out, 0, in0); \
    out = (RTYPE) __msa_insert_d((v2i64) out, 1, in1); \
}
#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
#define INSERT_D2_SH(...) INSERT_D2(v8i16, __VA_ARGS__)
#define INSERT_D2_SD(...) INSERT_D2(v2i64, __VA_ARGS__)

/* Description : Interleave even byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of 'in0' and even byte
                 elements of 'in1' are interleaved and copied to 'out0'
                 Even byte elements of 'in2' and even byte
                 elements of 'in3' are interleaved and copied to 'out1'
*/
#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
{ \
    out0 = (RTYPE) __msa_ilvev_b((v16i8) in1, (v16i8) in0); \
    out1 = (RTYPE) __msa_ilvev_b((v16i8) in3, (v16i8) in2); \
}
#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
#define ILVEV_B2_SB(...) ILVEV_B2(v16i8, __VA_ARGS__)
#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
#define ILVEV_B2_SD(...) ILVEV_B2(v2i64, __VA_ARGS__)

/* Description : Interleave even halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of 'in0' and even halfword
                 elements of 'in1' are interleaved and copied to 'out0'
                 Even halfword elements of 'in2' and even halfword
                 elements of 'in3' are interleaved and copied to 'out1'
*/
#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
{ \
    out0 = (RTYPE) __msa_ilvev_h((v8i16) in1, (v8i16) in0); \
    out1 = (RTYPE) __msa_ilvev_h((v8i16) in3, (v8i16) in2); \
}
#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)

/* Description : Interleave even word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even word elements of 'in0' and even word
                 elements of 'in1' are interleaved and copied to 'out0'
                 Even word elements of 'in2' and even word
                 elements of 'in3' are interleaved and copied to 'out1'
*/
#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
{ \
    out0 = (RTYPE) __msa_ilvev_w((v4i32) in1, (v4i32) in0); \
    out1 = (RTYPE) __msa_ilvev_w((v4i32) in3, (v4i32) in2); \
}
#define ILVEV_W2_UB(...) ILVEV_W2(v16u8, __VA_ARGS__)
#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
#define ILVEV_W2_UH(...) ILVEV_W2(v8u16, __VA_ARGS__)
#define ILVEV_W2_SD(...) ILVEV_W2(v2i64, __VA_ARGS__)

/* Description : Interleave even double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double word elements of 'in0' and even double word
                 elements of 'in1' are interleaved and copied to 'out0'
                 Even double word elements of 'in2' and even double word
                 elements of 'in3' are interleaved and copied to 'out1'
*/
#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
{ \
    out0 = (RTYPE) __msa_ilvev_d((v2i64) in1, (v2i64) in0); \
    out1 = (RTYPE) __msa_ilvev_d((v2i64) in3, (v2i64) in2); \
}
#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
#define ILVEV_D2_SB(...) ILVEV_D2(v16i8, __VA_ARGS__)
#define ILVEV_D2_SW(...) ILVEV_D2(v4i32, __VA_ARGS__)

/* Description : Interleave left half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of byte elements of in0 and left half of byte
                 elements of in1 are interleaved and copied to out0.
                 Left half of byte elements of in2 and left half of byte
                 elements of in3 are interleaved and copied to out1.
*/
#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
{ \
    out0 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1); \
    out1 = (RTYPE) __msa_ilvl_b((v16i8) in2, (v16i8) in3); \
}
#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)

#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                out0, out1, out2, out3) \
{ \
    ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
    ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
}
#define ILVL_B4_UB(...) ILVL_B4(v16u8, __VA_ARGS__)
#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)

/* Description : Interleave left half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of halfword elements of in0 and left half of halfword
                 elements of in1 are interleaved and copied to out0.
                 Left half of halfword elements of in2 and left half of halfword
                 elements of in3 are interleaved and copied to out1.
*/
#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
{ \
    out0 = (RTYPE) __msa_ilvl_h((v8i16) in0, (v8i16) in1); \
    out1 = (RTYPE) __msa_ilvl_h((v8i16) in2, (v8i16) in3); \
}
#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)

#define ILVL_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                out0, out1, out2, out3) \
{ \
    ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
    ILVL_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
}
#define ILVL_H4_SH(...) ILVL_H4(v8i16, __VA_ARGS__)
#define ILVL_H4_SW(...) ILVL_H4(v4i32, __VA_ARGS__)

/* Description : Interleave left half of word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of word elements of in0 and left half of word
                 elements of in1 are interleaved and copied to out0.
                 Left half of word elements of in2 and left half of word
                 elements of in3 are interleaved and copied to out1.
*/
#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
{ \
    out0 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1); \
    out1 = (RTYPE) __msa_ilvl_w((v4i32) in2, (v4i32) in3); \
}
#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__)
#define ILVL_W2_SB(...) ILVL_W2(v16i8, __VA_ARGS__)
#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)

/* Description : Interleave right half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Right half of byte elements of in0 and right half of byte
                 elements of in1 are interleaved and copied to out0.
                 Right half of byte elements of in2 and right half of byte
                 elements of in3 are interleaved and copied to out1.
                 Similar for other pairs
*/
#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
{ \
    out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1); \
    out1 = (RTYPE) __msa_ilvr_b((v16i8) in2, (v16i8) in3); \
}
#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
#define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__)

#define ILVR_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
{ \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
    out2 = (RTYPE) __msa_ilvr_b((v16i8) in4, (v16i8) in5); \
}
#define ILVR_B3_UB(...) ILVR_B3(v16u8, __VA_ARGS__)
#define ILVR_B3_SB(...) ILVR_B3(v16i8, __VA_ARGS__)
#define ILVR_B3_UH(...) ILVR_B3(v8u16, __VA_ARGS__)
#define ILVR_B3_SH(...) ILVR_B3(v8i16, __VA_ARGS__)

#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                out0, out1, out2, out3) \
{ \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
    ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
}
#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
#define ILVR_B4_SW(...) ILVR_B4(v4i32, __VA_ARGS__)

#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                in8, in9, in10, in11, in12, in13, in14, in15, \
                out0, out1, out2, out3, out4, out5, out6, out7) \
{ \
    ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
            out0, out1, out2, out3); \
    ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15, \
            out4, out5, out6, out7); \
}
#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)
#define ILVR_B8_SW(...) ILVR_B8(v4i32, __VA_ARGS__)
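
/* Usage sketch (illustrative, not part of the original header): the common
 * idiom of zero-extending the low eight bytes of two vectors to halfwords
 * by interleaving with a zero vector. Names are hypothetical.
 *
 *     v16i8 zero = { 0 };
 *     v16u8 src0, src1;
 *     v8u16 out0, out1;
 *
 *     ILVR_B2_UH(zero, src0, zero, src1, out0, out1);
 */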

/* Description : Interleave right half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Right half of halfword elements of in0 and right half of
                 halfword elements of in1 are interleaved and copied to out0.
                 Right half of halfword elements of in2 and right half of
                 halfword elements of in3 are interleaved and copied to out1.
                 Similar for other pairs
*/
#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
{ \
    out0 = (RTYPE) __msa_ilvr_h((v8i16) in0, (v8i16) in1); \
    out1 = (RTYPE) __msa_ilvr_h((v8i16) in2, (v8i16) in3); \
}
#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)

#define ILVR_H3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
{ \
    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
    out2 = (RTYPE) __msa_ilvr_h((v8i16) in4, (v8i16) in5); \
}
#define ILVR_H3_SH(...) ILVR_H3(v8i16, __VA_ARGS__)

#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                out0, out1, out2, out3) \
{ \
    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
    ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
}
#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
#define ILVR_H4_SW(...) ILVR_H4(v4i32, __VA_ARGS__)

#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
{ \
    out0 = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1); \
    out1 = (RTYPE) __msa_ilvr_w((v4i32) in2, (v4i32) in3); \
}
#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
#define ILVR_W2_SB(...) ILVR_W2(v16i8, __VA_ARGS__)
#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)

#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                out0, out1, out2, out3) \
{ \
    ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1); \
    ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3); \
}
#define ILVR_W4_SB(...) ILVR_W4(v16i8, __VA_ARGS__)
#define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__)

/* Description : Interleave right half of double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Right half of double word elements of in0 and right half of
                 double word elements of in1 are interleaved and copied to out0.
                 Right half of double word elements of in2 and right half of
                 double word elements of in3 are interleaved and copied to out1.
*/
#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
{ \
    out0 = (RTYPE) __msa_ilvr_d((v2i64) in0, (v2i64) in1); \
    out1 = (RTYPE) __msa_ilvr_d((v2i64) in2, (v2i64) in3); \
}
#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)

#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
{ \
    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
    out2 = (RTYPE) __msa_ilvr_d((v2i64) in4, (v2i64) in5); \
}
#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)

#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                out0, out1, out2, out3) \
{ \
    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
    ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3); \
}
#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)

/* Description : Interleave left half of double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of double word elements of in0 and left half of
                 double word elements of in1 are interleaved and copied to out0.
                 Left half of double word elements of in2 and left half of
                 double word elements of in3 are interleaved and copied to out1.
*/
#define ILVL_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
{ \
    out0 = (RTYPE) __msa_ilvl_d((v2i64) in0, (v2i64) in1); \
    out1 = (RTYPE) __msa_ilvl_d((v2i64) in2, (v2i64) in3); \
}
#define ILVL_D2_UB(...) ILVL_D2(v16u8, __VA_ARGS__)
#define ILVL_D2_SB(...) ILVL_D2(v16i8, __VA_ARGS__)
#define ILVL_D2_SH(...) ILVL_D2(v8i16, __VA_ARGS__)

/* Description : Interleave both left and right half of input vectors
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of byte elements from 'in0' and 'in1' are
                 interleaved and stored to 'out0'
                 Left half of byte elements from 'in0' and 'in1' are
                 interleaved and stored to 'out1'
*/
#define ILVRL_B2(RTYPE, in0, in1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1); \
    out1 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1); \
}
#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
#define ILVRL_B2_SW(...) ILVRL_B2(v4i32, __VA_ARGS__)

#define ILVRL_H2(RTYPE, in0, in1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_ilvr_h((v8i16) in0, (v8i16) in1); \
    out1 = (RTYPE) __msa_ilvl_h((v8i16) in0, (v8i16) in1); \
}
#define ILVRL_H2_UB(...) ILVRL_H2(v16u8, __VA_ARGS__)
#define ILVRL_H2_SB(...) ILVRL_H2(v16i8, __VA_ARGS__)
#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)

#define ILVRL_W2(RTYPE, in0, in1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1); \
    out1 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1); \
}
#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)

/* Description : Maximum values between signed elements of vector and
                 5-bit signed immediate value are copied to the output vector
   Arguments   : Inputs  - in0, in1, in2, in3, max_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Maximum of signed halfword element values from 'in0' and
                 'max_val' are written to the output vector 'in0'
*/
#define MAXI_SH2(RTYPE, in0, in1, max_val) \
{ \
    in0 = (RTYPE) __msa_maxi_s_h((v8i16) in0, max_val); \
    in1 = (RTYPE) __msa_maxi_s_h((v8i16) in1, max_val); \
}
#define MAXI_SH2_UH(...) MAXI_SH2(v8u16, __VA_ARGS__)
#define MAXI_SH2_SH(...) MAXI_SH2(v8i16, __VA_ARGS__)

#define MAXI_SH4(RTYPE, in0, in1, in2, in3, max_val) \
{ \
    MAXI_SH2(RTYPE, in0, in1, max_val); \
    MAXI_SH2(RTYPE, in2, in3, max_val); \
}
#define MAXI_SH4_UH(...) MAXI_SH4(v8u16, __VA_ARGS__)
#define MAXI_SH4_SH(...) MAXI_SH4(v8i16, __VA_ARGS__)

#define MAXI_SH8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, max_val) \
{ \
    MAXI_SH4(RTYPE, in0, in1, in2, in3, max_val); \
    MAXI_SH4(RTYPE, in4, in5, in6, in7, max_val); \
}
#define MAXI_SH8_UH(...) MAXI_SH8(v8u16, __VA_ARGS__)
#define MAXI_SH8_SH(...) MAXI_SH8(v8i16, __VA_ARGS__)

/* Description : Saturate the halfword element values to the max
                 unsigned value of (sat_val + 1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, in2, in3, sat_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each unsigned halfword element from 'in0' is saturated to the
                 value generated with the (sat_val + 1) bit range
                 Results are written in place to the original vectors
*/
#define SAT_UH2(RTYPE, in0, in1, sat_val) \
{ \
    in0 = (RTYPE) __msa_sat_u_h((v8u16) in0, sat_val); \
    in1 = (RTYPE) __msa_sat_u_h((v8u16) in1, sat_val); \
}
#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)
#define SAT_UH2_SH(...) SAT_UH2(v8i16, __VA_ARGS__)

#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) \
{ \
    SAT_UH2(RTYPE, in0, in1, sat_val); \
    SAT_UH2(RTYPE, in2, in3, sat_val); \
}
#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
#define SAT_UH4_SH(...) SAT_UH4(v8i16, __VA_ARGS__)

#define SAT_UH8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, sat_val) \
{ \
    SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val); \
    SAT_UH4(RTYPE, in4, in5, in6, in7, sat_val); \
}
#define SAT_UH8_UH(...) SAT_UH8(v8u16, __VA_ARGS__)
#define SAT_UH8_SH(...) SAT_UH8(v8i16, __VA_ARGS__)
/* Description : Saturate the halfword element values to the max
                 signed value of (sat_val+1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, in2, in3, sat_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each signed halfword element from 'in0' is saturated to the
                 value generated with (sat_val+1) bit range
                 Results are written in place to the original vectors
*/
#define SAT_SH2(RTYPE, in0, in1, sat_val)               \
{                                                       \
    in0 = (RTYPE) __msa_sat_s_h((v8i16) in0, sat_val);  \
    in1 = (RTYPE) __msa_sat_s_h((v8i16) in1, sat_val);  \
}
#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)

#define SAT_SH3(RTYPE, in0, in1, in2, sat_val)          \
{                                                       \
    SAT_SH2(RTYPE, in0, in1, sat_val);                  \
    in2 = (RTYPE) __msa_sat_s_h((v8i16) in2, sat_val);  \
}
#define SAT_SH3_SH(...) SAT_SH3(v8i16, __VA_ARGS__)

#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val)  \
{                                                    \
    SAT_SH2(RTYPE, in0, in1, sat_val);               \
    SAT_SH2(RTYPE, in2, in3, sat_val);               \
}
#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)

/* Description : Saturate the word element values to the max
                 signed value of (sat_val+1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, in2, in3, sat_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each signed word element from 'in0' is saturated to the
                 value generated with (sat_val+1) bit range
                 Results are written in place to the original vectors
*/
#define SAT_SW2(RTYPE, in0, in1, sat_val)               \
{                                                       \
    in0 = (RTYPE) __msa_sat_s_w((v4i32) in0, sat_val);  \
    in1 = (RTYPE) __msa_sat_s_w((v4i32) in1, sat_val);  \
}
#define SAT_SW2_SW(...) SAT_SW2(v4i32, __VA_ARGS__)

#define SAT_SW4(RTYPE, in0, in1, in2, in3, sat_val)  \
{                                                    \
    SAT_SW2(RTYPE, in0, in1, sat_val);               \
    SAT_SW2(RTYPE, in2, in3, sat_val);               \
}
#define SAT_SW4_SW(...) SAT_SW4(v4i32, __VA_ARGS__)

/* Description : Indexed halfword element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, idx0, idx1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'idx0' element value from 'in' vector is replicated to all
                 elements in 'out0' vector
                 Valid index range for halfword operation is 0-7
*/
#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1)  \
{                                                     \
    out0 = (RTYPE) __msa_splati_h((v8i16) in, idx0);  \
    out1 = (RTYPE) __msa_splati_h((v8i16) in, idx1);  \
}
#define SPLATI_H2_SB(...) SPLATI_H2(v16i8, __VA_ARGS__)
#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)

#define SPLATI_H3(RTYPE, in, idx0, idx1, idx2,        \
                  out0, out1, out2)                   \
{                                                     \
    SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1);     \
    out2 = (RTYPE) __msa_splati_h((v8i16) in, idx2);  \
}
#define SPLATI_H3_SB(...) SPLATI_H3(v16i8, __VA_ARGS__)
#define SPLATI_H3_SH(...) SPLATI_H3(v8i16, __VA_ARGS__)

#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3,  \
                  out0, out1, out2, out3)             \
{                                                     \
    SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1);     \
    SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3);     \
}
#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)
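/* Editor's illustrative sketch (not part of the original header):
 * SPLATI_H4_SH is the usual way to broadcast four filter taps, loaded
 * together as one halfword vector, into one vector per tap.  The helper
 * and the 'filter' pointer are hypothetical; the block is compile-guarded.
 */
#if 0
static void example_broadcast_taps(int16_t *filter, v8i16 taps[4])
{
    v8i16 filt = LD_SH(filter);  /* 8 coefficients in a single load */

    /* taps[n] holds coefficient n replicated in all 8 element positions */
    SPLATI_H4_SH(filt, 0, 1, 2, 3, taps[0], taps[1], taps[2], taps[3]);
}
#endif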
/* Description : Indexed word element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, stidx
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'stidx' element value from 'in' vector is replicated to all
                 elements in 'out0' vector
                 'stidx + 1' element value from 'in' vector is replicated to
                 all elements in 'out1' vector
                 Valid index range for word operation is 0-3
*/
#define SPLATI_W2(RTYPE, in, stidx, out0, out1)            \
{                                                          \
    out0 = (RTYPE) __msa_splati_w((v4i32) in, stidx);      \
    out1 = (RTYPE) __msa_splati_w((v4i32) in, (stidx+1));  \
}
#define SPLATI_W2_SH(...) SPLATI_W2(v8i16, __VA_ARGS__)
#define SPLATI_W2_SW(...) SPLATI_W2(v4i32, __VA_ARGS__)

#define SPLATI_W4(RTYPE, in, out0, out1, out2, out3)  \
{                                                     \
    SPLATI_W2(RTYPE, in, 0, out0, out1);              \
    SPLATI_W2(RTYPE, in, 2, out2, out3);              \
}
#define SPLATI_W4_SH(...) SPLATI_W4(v8i16, __VA_ARGS__)
#define SPLATI_W4_SW(...) SPLATI_W4(v4i32, __VA_ARGS__)

/* Description : Pack even byte elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of in0 are copied to the left half of
                 out0 & even byte elements of in1 are copied to the right
                 half of out0.
                 Even byte elements of in2 are copied to the left half of
                 out1 & even byte elements of in3 are copied to the right
                 half of out1.
*/
#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_pckev_b((v16i8) in0, (v16i8) in1);  \
    out1 = (RTYPE) __msa_pckev_b((v16i8) in2, (v16i8) in3);  \
}
#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
#define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__)

#define PCKEV_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                        \
    PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_pckev_b((v16i8) in4, (v16i8) in5);              \
}
#define PCKEV_B3_UB(...) PCKEV_B3(v16u8, __VA_ARGS__)
#define PCKEV_B3_SB(...) PCKEV_B3(v16i8, __VA_ARGS__)

#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
#define PCKEV_B4_SW(...) PCKEV_B4(v4i32, __VA_ARGS__)
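/* Editor's illustrative sketch (not from the original source): after
 * halfword arithmetic, PCKEV_B2_UB keeps the even (low) byte of every
 * halfword element, narrowing four result vectors back to two byte vectors
 * ready to store.  The helper is hypothetical and compile-guarded.
 */
#if 0
static void example_narrow_halfwords(v8i16 res0, v8i16 res1,
                                     v8i16 res2, v8i16 res3,
                                     uint8_t *dst)
{
    v16u8 out0, out1;

    /* even bytes of res1:res0 -> out0, of res3:res2 -> out1 */
    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
    ST_UB(out0, dst);
    ST_UB(out1, dst + 16);
}
#endif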
/* Description : Pack even halfword elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of in0 are copied to the left half of
                 out0 & even halfword elements of in1 are copied to the right
                 half of out0.
                 Even halfword elements of in2 are copied to the left half of
                 out1 & even halfword elements of in3 are copied to the right
                 half of out1.
*/
#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_pckev_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_pckev_h((v8i16) in2, (v8i16) in3);  \
}
#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)

#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
#define PCKEV_H4_SW(...) PCKEV_H4(v4i32, __VA_ARGS__)

/* Description : Pack even double word elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double elements of in0 are copied to the left half of
                 out0 & even double elements of in1 are copied to the right
                 half of out0.
                 Even double elements of in2 are copied to the left half of
                 out1 & even double elements of in3 are copied to the right
                 half of out1.
*/
#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_pckev_d((v2i64) in0, (v2i64) in1);  \
    out1 = (RTYPE) __msa_pckev_d((v2i64) in2, (v2i64) in3);  \
}
#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
#define PCKEV_D2_SB(...) PCKEV_D2(v16i8, __VA_ARGS__)
#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)

#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)

/* Description : Pack odd double word elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : The odd (index 1) double word element of 'in1' is copied to
                 the right half of 'out0' and the odd double word element of
                 'in0' to the left half of 'out0'.
                 Likewise, odd double word elements of 'in2' and 'in3' are
                 packed into 'out1'.
*/
#define PCKOD_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_pckod_d((v2i64) in0, (v2i64) in1);  \
    out1 = (RTYPE) __msa_pckod_d((v2i64) in2, (v2i64) in3);  \
}
#define PCKOD_D2_UB(...) PCKOD_D2(v16u8, __VA_ARGS__)
#define PCKOD_D2_SH(...) PCKOD_D2(v8i16, __VA_ARGS__)
#define PCKOD_D2_SD(...) PCKOD_D2(v2i64, __VA_ARGS__)

/* Description : Each byte element is logically xor'ed with immediate 128
   Arguments   : Inputs  - in0, in1
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Each unsigned byte element from input vector 'in0' is
                 logically xor'ed with 128 and the result is stored in place
                 in 'in0'
                 Each unsigned byte element from input vector 'in1' is
                 logically xor'ed with 128 and the result is stored in place
                 in 'in1'
                 Similar for other pairs
*/
#define XORI_B2_128(RTYPE, in0, in1)               \
{                                                  \
    in0 = (RTYPE) __msa_xori_b((v16u8) in0, 128);  \
    in1 = (RTYPE) __msa_xori_b((v16u8) in1, 128);  \
}
#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)
#define XORI_B2_128_SH(...) XORI_B2_128(v8i16, __VA_ARGS__)

#define XORI_B3_128(RTYPE, in0, in1, in2)          \
{                                                  \
    XORI_B2_128(RTYPE, in0, in1);                  \
    in2 = (RTYPE) __msa_xori_b((v16u8) in2, 128);  \
}
#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)
#define XORI_B4_128(RTYPE, in0, in1, in2, in3)  \
{                                               \
    XORI_B2_128(RTYPE, in0, in1);               \
    XORI_B2_128(RTYPE, in2, in3);               \
}
#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
#define XORI_B4_128_SH(...) XORI_B4_128(v8i16, __VA_ARGS__)

#define XORI_B5_128(RTYPE, in0, in1, in2, in3, in4)  \
{                                                    \
    XORI_B3_128(RTYPE, in0, in1, in2);               \
    XORI_B2_128(RTYPE, in3, in4);                    \
}
#define XORI_B5_128_SB(...) XORI_B5_128(v16i8, __VA_ARGS__)

#define XORI_B6_128(RTYPE, in0, in1, in2, in3, in4, in5)  \
{                                                         \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);               \
    XORI_B2_128(RTYPE, in4, in5);                         \
}
#define XORI_B6_128_SB(...) XORI_B6_128(v16i8, __VA_ARGS__)

#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6)  \
{                                                              \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);                    \
    XORI_B3_128(RTYPE, in4, in5, in6);                         \
}
#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)

#define XORI_B8_128(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7)  \
{                                                                   \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);                         \
    XORI_B4_128(RTYPE, in4, in5, in6, in7);                         \
}
#define XORI_B8_128_SB(...) XORI_B8_128(v16i8, __VA_ARGS__)
#define XORI_B8_128_UB(...) XORI_B8_128(v16u8, __VA_ARGS__)

/* Description : Addition of signed halfword elements and signed saturation
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'in0' are added to signed
                 halfword elements of 'in1'. The result is then signed
                 saturated between -32768 and +32767 (as per the halfword
                 data type)
                 Similar for other pairs
*/
#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1)       \
{                                                             \
    out0 = (RTYPE) __msa_adds_s_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_adds_s_h((v8i16) in2, (v8i16) in3);  \
}
#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)

#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ADDS_SH4_UH(...) ADDS_SH4(v8u16, __VA_ARGS__)
#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
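/* Editor's illustrative sketch (not part of the original header): a typical
 * mini-pipeline combining the helpers above - pixels are re-biased to the
 * signed range with XORI_B2_128_SB (x ^ 128 == x - 128 for unsigned x),
 * multiplied with signed taps, and the partial sums are combined with the
 * saturating ADDS_SH2_SH.  All names are hypothetical; compile-guarded.
 */
#if 0
static void example_bias_dotp_accumulate(v16u8 pix0, v16u8 pix1,
                                         v16i8 taps0, v16i8 taps1,
                                         v8i16 *acc0, v8i16 *acc1)
{
    v16i8 v0 = (v16i8) pix0;
    v16i8 v1 = (v16i8) pix1;
    v8i16 sum0, sum1;

    XORI_B2_128_SB(v0, v1);            /* unsigned pixels -> signed bytes */
    sum0 = __msa_dotp_s_h(v0, taps0);  /* widening signed dot products    */
    sum1 = __msa_dotp_s_h(v1, taps1);
    /* saturating halfword accumulation of the two partial results */
    ADDS_SH2_SH(sum0, *acc0, sum1, *acc1, *acc0, *acc1);
}
#endif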
/* Description : Shift left all elements of vector (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per input vector type
   Details     : Each element of vector 'in0' is left shifted by 'shift' and
                 the result is written in place to 'in0'
                 Similar for other pairs
*/
#define SLLI_2V(in0, in1, shift)  \
{                                 \
    in0 = in0 << shift;           \
    in1 = in1 << shift;           \
}
#define SLLI_4V(in0, in1, in2, in3, shift)  \
{                                           \
    in0 = in0 << shift;                     \
    in1 = in1 << shift;                     \
    in2 = in2 << shift;                     \
    in3 = in3 << shift;                     \
}

/* Description : Arithmetic shift right all elements of vector
                 (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per input vector type
   Details     : Each element of vector 'in0' is right shifted by 'shift' and
                 the result is written in place to 'in0'
                 Here, 'shift' is a GP variable passed in
                 Similar for other pairs
*/
#define SRA_4V(in0, in1, in2, in3, shift)  \
{                                          \
    in0 = in0 >> shift;                    \
    in1 = in1 >> shift;                    \
    in2 = in2 >> shift;                    \
    in3 = in3 >> shift;                    \
}

/* Description : Shift right logical all halfword elements of vector
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right logically by
                 the number of bits held in the corresponding element of
                 vector 'shift' and the result is written in place to 'in0'
                 Here, 'shift' is a vector passed in
                 Similar for other pairs
*/
#define SRL_H4(RTYPE, in0, in1, in2, in3, shift)            \
{                                                           \
    in0 = (RTYPE) __msa_srl_h((v8i16) in0, (v8i16) shift);  \
    in1 = (RTYPE) __msa_srl_h((v8i16) in1, (v8i16) shift);  \
    in2 = (RTYPE) __msa_srl_h((v8i16) in2, (v8i16) shift);  \
    in3 = (RTYPE) __msa_srl_h((v8i16) in3, (v8i16) shift);  \
}
#define SRL_H4_UH(...) SRL_H4(v8u16, __VA_ARGS__)

#define SRLR_H4(RTYPE, in0, in1, in2, in3, shift)            \
{                                                            \
    in0 = (RTYPE) __msa_srlr_h((v8i16) in0, (v8i16) shift);  \
    in1 = (RTYPE) __msa_srlr_h((v8i16) in1, (v8i16) shift);  \
    in2 = (RTYPE) __msa_srlr_h((v8i16) in2, (v8i16) shift);  \
    in3 = (RTYPE) __msa_srlr_h((v8i16) in3, (v8i16) shift);  \
}
#define SRLR_H4_UH(...) SRLR_H4(v8u16, __VA_ARGS__)
#define SRLR_H4_SH(...) SRLR_H4(v8i16, __VA_ARGS__)

#define SRLR_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, shift)  \
{                                                                      \
    SRLR_H4(RTYPE, in0, in1, in2, in3, shift);                         \
    SRLR_H4(RTYPE, in4, in5, in6, in7, shift);                         \
}
#define SRLR_H8_UH(...) SRLR_H8(v8u16, __VA_ARGS__)
#define SRLR_H8_SH(...) SRLR_H8(v8i16, __VA_ARGS__)
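/* Editor's note (not from the original source): the rounded shifts in this
 * family (srlr/srar/srari) compute out = (in + (1 << (shift - 1))) >> shift
 * for shift > 0, i.e. the last bit shifted out is added back.  A minimal
 * compile-guarded sketch with a worked value:
 */
#if 0
static v8i16 example_rounded_average(v8i16 a, v8i16 b)
{
    v8i16 sum = a + b;           /* element-wise sum */

    /* (a + b + 1) >> 1 per element: e.g. (5 + 6 + 1) >> 1 = 6, whereas a
     * plain arithmetic shift would truncate (5 + 6) >> 1 to 5 */
    return __msa_srari_h(sum, 1);
}
#endif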
/* Description : Shift right arithmetic rounded halfwords
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetically
                 by the number of bits held in the corresponding element of
                 vector 'shift'.
                 The last discarded bit is added to the shifted value for
                 rounding and the result is written in place to 'in0'
                 Here, 'shift' is a vector passed in
                 Similar for other pairs
*/
#define SRAR_H2(RTYPE, in0, in1, shift)                      \
{                                                            \
    in0 = (RTYPE) __msa_srar_h((v8i16) in0, (v8i16) shift);  \
    in1 = (RTYPE) __msa_srar_h((v8i16) in1, (v8i16) shift);  \
}
#define SRAR_H2_UH(...) SRAR_H2(v8u16, __VA_ARGS__)
#define SRAR_H2_SH(...) SRAR_H2(v8i16, __VA_ARGS__)

#define SRAR_H3(RTYPE, in0, in1, in2, shift)                 \
{                                                            \
    SRAR_H2(RTYPE, in0, in1, shift)                          \
    in2 = (RTYPE) __msa_srar_h((v8i16) in2, (v8i16) shift);  \
}
#define SRAR_H3_SH(...) SRAR_H3(v8i16, __VA_ARGS__)

#define SRAR_H4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                  \
    SRAR_H2(RTYPE, in0, in1, shift)                \
    SRAR_H2(RTYPE, in2, in3, shift)                \
}
#define SRAR_H4_UH(...) SRAR_H4(v8u16, __VA_ARGS__)
#define SRAR_H4_SH(...) SRAR_H4(v8i16, __VA_ARGS__)

/* Description : Shift right arithmetic rounded words
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetically
                 by the number of bits held in the corresponding element of
                 vector 'shift'.
                 The last discarded bit is added to the shifted value for
                 rounding and the result is written in place to 'in0'
                 Here, 'shift' is a vector passed in
                 Similar for other pairs
*/
#define SRAR_W2(RTYPE, in0, in1, shift)                      \
{                                                            \
    in0 = (RTYPE) __msa_srar_w((v4i32) in0, (v4i32) shift);  \
    in1 = (RTYPE) __msa_srar_w((v4i32) in1, (v4i32) shift);  \
}
#define SRAR_W2_SW(...) SRAR_W2(v4i32, __VA_ARGS__)

#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                  \
    SRAR_W2(RTYPE, in0, in1, shift)                \
    SRAR_W2(RTYPE, in2, in3, shift)                \
}
#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)

/* Description : Shift right arithmetic rounded (immediate)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetically
                 by the value in 'shift'.
                 The last discarded bit is added to the shifted value for
                 rounding and the result is written in place to 'in0'
                 Similar for other pairs
*/
#define SRARI_H2(RTYPE, in0, in1, shift)              \
{                                                     \
    in0 = (RTYPE) __msa_srari_h((v8i16) in0, shift);  \
    in1 = (RTYPE) __msa_srari_h((v8i16) in1, shift);  \
}
#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)

#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                   \
    SRARI_H2(RTYPE, in0, in1, shift);               \
    SRARI_H2(RTYPE, in2, in3, shift);               \
}
#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)

/* Description : Shift right arithmetic rounded (immediate)
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetically
                 by the value in 'shift'.
                 The last discarded bit is added to the shifted value for
                 rounding and the result is written in place to 'in0'
                 Similar for other pairs
*/
#define SRARI_W2(RTYPE, in0, in1, shift)              \
{                                                     \
    in0 = (RTYPE) __msa_srari_w((v4i32) in0, shift);  \
    in1 = (RTYPE) __msa_srari_w((v4i32) in1, shift);  \
}
#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)

#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                   \
    SRARI_W2(RTYPE, in0, in1, shift);               \
    SRARI_W2(RTYPE, in2, in3, shift);               \
}
#define SRARI_W4_SH(...) SRARI_W4(v8i16, __VA_ARGS__)
#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)

/* Description : Multiplication of pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element from 'in0' is multiplied with the corresponding
                 element from 'in1' and the result is written to 'out0'
                 Similar for other pairs
*/
#define MUL2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = in0 * in1;                         \
    out1 = in2 * in3;                         \
}
#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)  \
{                                                                             \
    MUL2(in0, in1, in2, in3, out0, out1);                                     \
    MUL2(in4, in5, in6, in7, out2, out3);                                     \
}

/* Description : Addition of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element of the two pairs of vectors is added,
                 producing two result vectors
*/
#define ADD2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = in0 + in1;                         \
    out1 = in2 + in3;                         \
}
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)  \
{                                                                             \
    ADD2(in0, in1, in2, in3, out0, out1);                                     \
    ADD2(in4, in5, in6, in7, out2, out3);                                     \
}

/* Description : Subtraction of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element of the two pairs of vectors is subtracted,
                 producing two result vectors
*/
#define SUB2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = in0 - in1;                         \
    out1 = in2 - in3;                         \
}
#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)  \
{                                                                             \
    out0 = in0 - in1;                                                         \
    out1 = in2 - in3;                                                         \
    out2 = in4 - in5;                                                         \
    out3 = in6 - in7;                                                         \
}

/* Description : Sign extend byte elements from right half of the vector
   Arguments   : Input  - in (byte vector)
                 Output - out (sign extended halfword vector)
                 Return Type - signed halfword
   Details     : Sign bit of byte elements from input vector 'in' is
                 extracted and interleaved with the same vector 'in' to
                 generate 8 halfword elements keeping the sign intact
*/
#define UNPCK_R_SB_SH(in, out)                       \
{                                                    \
    v16i8 sign_m;                                    \
                                                     \
    sign_m = __msa_clti_s_b((v16i8) in, 0);          \
    out = (v8i16) __msa_ilvr_b(sign_m, (v16i8) in);  \
}

/* Description : Sign extend halfword elements from right half of the vector
   Arguments   : Input  - in (halfword vector)
                 Output - out (sign extended word vector)
                 Return Type - signed word
   Details     : Sign bit of halfword elements from input vector 'in' is
                 extracted and interleaved with the same vector 'in' to
                 generate 4 word elements keeping the sign intact
*/
#define UNPCK_R_SH_SW(in, out)                       \
{                                                    \
    v8i16 sign_m;                                    \
                                                     \
    sign_m = __msa_clti_s_h((v8i16) in, 0);          \
    out = (v4i32) __msa_ilvr_h(sign_m, (v8i16) in);  \
}
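/* Editor's illustrative sketch (not from the original source): widening a
 * saturated halfword sum to words with UNPCK_R_SH_SW before word-precision
 * accumulation.  The helper is hypothetical and compile-guarded.
 */
#if 0
static v4i32 example_widen_low_half(v8i16 a, v8i16 b)
{
    v8i16 sum;
    v4i32 sum_w;

    sum = __msa_adds_s_h(a, b);  /* saturating halfword add              */
    UNPCK_R_SH_SW(sum, sum_w);   /* sign-extend the 4 rightmost halfwords */
    return sum_w;
}
#endif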
/* Description : Sign extend byte elements from input vector and return
                 halfword results in pair of vectors
   Arguments   : Input   - in (byte vector)
                 Outputs - out0, out1 (sign extended halfword vectors)
                 Return Type - signed halfword
   Details     : Sign bit of byte elements from input vector 'in' is
                 extracted and interleaved right with the same vector 'in' to
                 generate 8 signed halfword elements in 'out0'
                 Then interleaved left with the same vector 'in' to
                 generate 8 signed halfword elements in 'out1'
*/
#define UNPCK_SB_SH(in, out0, out1)         \
{                                           \
    v16i8 tmp_m;                            \
                                            \
    tmp_m = __msa_clti_s_b((v16i8) in, 0);  \
    ILVRL_B2_SH(tmp_m, in, out0, out1);     \
}

/* Description : Zero extend unsigned byte elements to halfword elements
   Arguments   : Input   - in (unsigned byte vector)
                 Outputs - out0, out1 (unsigned halfword vectors)
                 Return Type - signed halfword
   Details     : Zero extended right half of vector is returned in 'out0'
                 Zero extended left half of vector is returned in 'out1'
*/
#define UNPCK_UB_SH(in, out0, out1)       \
{                                         \
    v16i8 zero_m = { 0 };                 \
                                          \
    ILVRL_B2_SH(zero_m, in, out0, out1);  \
}

/* Description : Sign extend halfword elements from input vector and return
                 the results in pair of vectors
   Arguments   : Input   - in (halfword vector)
                 Outputs - out0, out1 (sign extended word vectors)
                 Return Type - signed word
   Details     : Sign bit of halfword elements from input vector 'in' is
                 extracted and interleaved right with the same vector 'in' to
                 generate 4 signed word elements in 'out0'
                 Then interleaved left with the same vector 'in' to
                 generate 4 signed word elements in 'out1'
*/
#define UNPCK_SH_SW(in, out0, out1)         \
{                                           \
    v8i16 tmp_m;                            \
                                            \
    tmp_m = __msa_clti_s_h((v8i16) in, 0);  \
    ILVRL_H2_SW(tmp_m, in, out0, out1);     \
}

/* Description : Swap two variables
   Arguments   : Inputs  - in0, in1
                 Outputs - in0, in1 (in place)
   Details     : Swapping of two input variables using xor
*/
#define SWAP(in0, in1)  \
{                       \
    in0 = in0 ^ in1;    \
    in1 = in0 ^ in1;    \
    in0 = in0 ^ in1;    \
}

/* Description : Butterfly of 4 input vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
   Details     : Butterfly operation
*/
#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                \
    out0 = in0 + in3;                                            \
    out1 = in1 + in2;                                            \
                                                                 \
    out2 = in1 - in2;                                            \
    out3 = in0 - in3;                                            \
}

/* Description : Butterfly of 8 input vectors
   Arguments   : Inputs  - in0 ... in7
                 Outputs - out0 ... out7
   Details     : Butterfly operation
*/
#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7,          \
                    out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                    \
    out0 = in0 + in7;                                                \
    out1 = in1 + in6;                                                \
    out2 = in2 + in5;                                                \
    out3 = in3 + in4;                                                \
                                                                     \
    out4 = in3 - in4;                                                \
    out5 = in2 - in5;                                                \
    out6 = in1 - in6;                                                \
    out7 = in0 - in7;                                                \
}

/* Description : Butterfly of 16 input vectors
   Arguments   : Inputs  - in0 ... in15
                 Outputs - out0 ..
out15 2280 Details : Butterfly operation 2281 */ 2282 #define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, \ 2283 in8, in9, in10, in11, in12, in13, in14, in15, \ 2284 out0, out1, out2, out3, out4, out5, out6, out7, \ 2285 out8, out9, out10, out11, out12, out13, out14, out15) \ 2286 { \ 2287 out0 = in0 + in15; \ 2288 out1 = in1 + in14; \ 2289 out2 = in2 + in13; \ 2290 out3 = in3 + in12; \ 2291 out4 = in4 + in11; \ 2292 out5 = in5 + in10; \ 2293 out6 = in6 + in9; \ 2294 out7 = in7 + in8; \ 2295 \ 2296 out8 = in7 - in8; \ 2297 out9 = in6 - in9; \ 2298 out10 = in5 - in10; \ 2299 out11 = in4 - in11; \ 2300 out12 = in3 - in12; \ 2301 out13 = in2 - in13; \ 2302 out14 = in1 - in14; \ 2303 out15 = in0 - in15; \ 2304 } 2305 2306 /* Description : Transposes input 4x4 byte block 2307 Arguments : Inputs - in0, in1, in2, in3 (input 4x4 byte block) 2308 Outputs - out0, out1, out2, out3 (output 4x4 byte block) 2309 Return Type - unsigned byte 2310 Details : 2311 */ 2312 #define TRANSPOSE4x4_UB_UB(in0, in1, in2, in3, out0, out1, out2, out3) \ 2313 { \ 2314 v16i8 zero_m = { 0 }; \ 2315 v16i8 s0_m, s1_m, s2_m, s3_m; \ 2316 \ 2317 ILVR_D2_SB(in1, in0, in3, in2, s0_m, s1_m); \ 2318 ILVRL_B2_SB(s1_m, s0_m, s2_m, s3_m); \ 2319 \ 2320 out0 = (v16u8) __msa_ilvr_b(s3_m, s2_m); \ 2321 out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 4); \ 2322 out2 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out1, 4); \ 2323 out3 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out2, 4); \ 2324 } 2325 2326 /* Description : Transposes input 8x4 byte block into 4x8 2327 Arguments : Inputs - in0, in1, in2, in3 (input 8x4 byte block) 2328 Outputs - out0, out1, out2, out3 (output 4x8 byte block) 2329 Return Type - as per RTYPE 2330 Details : 2331 */ 2332 #define TRANSPOSE8x4_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 2333 out0, out1, out2, out3) \ 2334 { \ 2335 v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ 2336 \ 2337 ILVEV_W2_SB(in0, in4, in1, in5, tmp0_m, tmp1_m); \ 2338 tmp2_m = __msa_ilvr_b(tmp1_m, tmp0_m); \ 2339 ILVEV_W2_SB(in2, in6, in3, in7, tmp0_m, tmp1_m); \ 2340 \ 2341 tmp3_m = __msa_ilvr_b(tmp1_m, tmp0_m); \ 2342 ILVRL_H2_SB(tmp3_m, tmp2_m, tmp0_m, tmp1_m); \ 2343 \ 2344 ILVRL_W2(RTYPE, tmp1_m, tmp0_m, out0, out2); \ 2345 out1 = (RTYPE) __msa_ilvl_d((v2i64) out2, (v2i64) out0); \ 2346 out3 = (RTYPE) __msa_ilvl_d((v2i64) out0, (v2i64) out2); \ 2347 } 2348 #define TRANSPOSE8x4_UB_UB(...) TRANSPOSE8x4_UB(v16u8, __VA_ARGS__) 2349 #define TRANSPOSE8x4_UB_UH(...) TRANSPOSE8x4_UB(v8u16, __VA_ARGS__) 2350 2351 /* Description : Transposes input 8x8 byte block 2352 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 2353 (input 8x8 byte block) 2354 Outputs - out0, out1, out2, out3, out4, out5, out6, out7 2355 (output 8x8 byte block) 2356 Return Type - as per RTYPE 2357 Details : 2358 */ 2359 #define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 2360 out0, out1, out2, out3, out4, out5, out6, out7) \ 2361 { \ 2362 v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ 2363 v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ 2364 v16i8 zeros = { 0 }; \ 2365 \ 2366 ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, \ 2367 tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ 2368 ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m); \ 2369 ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m); \ 2370 ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2); \ 2371 ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6); \ 2372 SLDI_B4(RTYPE, zeros, out0, zeros, out2, zeros, out4, zeros, out6, \ 2373 8, out1, out3, out5, out7); \ 2374 } 2375 #define TRANSPOSE8x8_UB_UB(...) 
TRANSPOSE8x8_UB(v16u8, __VA_ARGS__) 2376 #define TRANSPOSE8x8_UB_UH(...) TRANSPOSE8x8_UB(v8u16, __VA_ARGS__) 2377 2378 /* Description : Transposes 16x4 block into 4x16 with byte elements in vectors 2379 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, 2380 in8, in9, in10, in11, in12, in13, in14, in15 2381 Outputs - out0, out1, out2, out3 2382 Return Type - unsigned byte 2383 Details : 2384 */ 2385 #define TRANSPOSE16x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \ 2386 in8, in9, in10, in11, in12, in13, in14, in15, \ 2387 out0, out1, out2, out3) \ 2388 { \ 2389 v2i64 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ 2390 \ 2391 ILVEV_W2_SD(in0, in4, in8, in12, tmp0_m, tmp1_m); \ 2392 out1 = (v16u8) __msa_ilvev_d(tmp1_m, tmp0_m); \ 2393 \ 2394 ILVEV_W2_SD(in1, in5, in9, in13, tmp0_m, tmp1_m); \ 2395 out3 = (v16u8) __msa_ilvev_d(tmp1_m, tmp0_m); \ 2396 \ 2397 ILVEV_W2_SD(in2, in6, in10, in14, tmp0_m, tmp1_m); \ 2398 \ 2399 tmp2_m = __msa_ilvev_d(tmp1_m, tmp0_m); \ 2400 ILVEV_W2_SD(in3, in7, in11, in15, tmp0_m, tmp1_m); \ 2401 \ 2402 tmp3_m = __msa_ilvev_d(tmp1_m, tmp0_m); \ 2403 ILVEV_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m); \ 2404 out0 = (v16u8) __msa_ilvev_h((v8i16) tmp1_m, (v8i16) tmp0_m); \ 2405 out2 = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m); \ 2406 \ 2407 tmp0_m = (v2i64) __msa_ilvod_b((v16i8) out3, (v16i8) out1); \ 2408 tmp1_m = (v2i64) __msa_ilvod_b((v16i8) tmp3_m, (v16i8) tmp2_m); \ 2409 out1 = (v16u8) __msa_ilvev_h((v8i16) tmp1_m, (v8i16) tmp0_m); \ 2410 out3 = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m); \ 2411 } 2412 2413 /* Description : Transposes 16x8 block into 8x16 with byte elements in vectors 2414 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, 2415 in8, in9, in10, in11, in12, in13, in14, in15 2416 Outputs - out0, out1, out2, out3, out4, out5, out6, out7 2417 Return Type - unsigned byte 2418 Details : 2419 */ 2420 #define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \ 2421 in8, in9, in10, in11, in12, in13, in14, in15, \ 2422 out0, out1, out2, out3, out4, out5, out6, out7) \ 2423 { \ 2424 v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ 2425 v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ 2426 \ 2427 ILVEV_D2_UB(in0, in8, in1, in9, out7, out6); \ 2428 ILVEV_D2_UB(in2, in10, in3, in11, out5, out4); \ 2429 ILVEV_D2_UB(in4, in12, in5, in13, out3, out2); \ 2430 ILVEV_D2_UB(in6, in14, in7, in15, out1, out0); \ 2431 \ 2432 tmp0_m = (v16u8) __msa_ilvev_b((v16i8) out6, (v16i8) out7); \ 2433 tmp4_m = (v16u8) __msa_ilvod_b((v16i8) out6, (v16i8) out7); \ 2434 tmp1_m = (v16u8) __msa_ilvev_b((v16i8) out4, (v16i8) out5); \ 2435 tmp5_m = (v16u8) __msa_ilvod_b((v16i8) out4, (v16i8) out5); \ 2436 out5 = (v16u8) __msa_ilvev_b((v16i8) out2, (v16i8) out3); \ 2437 tmp6_m = (v16u8) __msa_ilvod_b((v16i8) out2, (v16i8) out3); \ 2438 out7 = (v16u8) __msa_ilvev_b((v16i8) out0, (v16i8) out1); \ 2439 tmp7_m = (v16u8) __msa_ilvod_b((v16i8) out0, (v16i8) out1); \ 2440 \ 2441 ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \ 2442 out0 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m); \ 2443 out4 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m); \ 2444 \ 2445 tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m); \ 2446 tmp3_m = (v16u8) __msa_ilvod_h((v8i16) out7, (v8i16) out5); \ 2447 out2 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m); \ 2448 out6 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m); \ 2449 \ 2450 ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \ 2451 out1 = (v16u8) __msa_ilvev_w((v4i32) 
tmp3_m, (v4i32) tmp2_m); \ 2452 out5 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m); \ 2453 \ 2454 tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp5_m, (v8i16) tmp4_m); \ 2455 tmp3_m = (v16u8) __msa_ilvod_h((v8i16) tmp7_m, (v8i16) tmp6_m); \ 2456 out3 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m); \ 2457 out7 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m); \ 2458 } 2459 2460 /* Description : Transposes 4x4 block with half word elements in vectors 2461 Arguments : Inputs - in0, in1, in2, in3 2462 Outputs - out0, out1, out2, out3 2463 Return Type - signed halfword 2464 Details : 2465 */ 2466 #define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \ 2467 { \ 2468 v8i16 s0_m, s1_m; \ 2469 \ 2470 ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m); \ 2471 ILVRL_W2_SH(s1_m, s0_m, out0, out2); \ 2472 out1 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out0); \ 2473 out3 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out2); \ 2474 } 2475 2476 /* Description : Transposes 8x8 block with half word elements in vectors 2477 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 2478 Outputs - out0, out1, out2, out3, out4, out5, out6, out7 2479 Return Type - as per RTYPE 2480 Details : 2481 */ 2482 #define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 2483 out0, out1, out2, out3, out4, out5, out6, out7) \ 2484 { \ 2485 v8i16 s0_m, s1_m; \ 2486 v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ 2487 v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ 2488 \ 2489 ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \ 2490 ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m); \ 2491 ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \ 2492 ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m); \ 2493 ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \ 2494 ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m); \ 2495 ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \ 2496 ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m); \ 2497 PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, \ 2498 tmp3_m, tmp7_m, out0, out2, out4, out6); \ 2499 out1 = (RTYPE) __msa_pckod_d((v2i64) tmp0_m, (v2i64) tmp4_m); \ 2500 out3 = (RTYPE) __msa_pckod_d((v2i64) tmp1_m, (v2i64) tmp5_m); \ 2501 out5 = (RTYPE) __msa_pckod_d((v2i64) tmp2_m, (v2i64) tmp6_m); \ 2502 out7 = (RTYPE) __msa_pckod_d((v2i64) tmp3_m, (v2i64) tmp7_m); \ 2503 } 2504 #define TRANSPOSE8x8_UH_UH(...) TRANSPOSE8x8_H(v8u16, __VA_ARGS__) 2505 #define TRANSPOSE8x8_SH_SH(...) 
TRANSPOSE8x8_H(v8i16, __VA_ARGS__) 2506 2507 /* Description : Transposes 4x4 block with word elements in vectors 2508 Arguments : Inputs - in0, in1, in2, in3 2509 Outputs - out0, out1, out2, out3 2510 Return Type - signed word 2511 Details : 2512 */ 2513 #define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) \ 2514 { \ 2515 v4i32 s0_m, s1_m, s2_m, s3_m; \ 2516 \ 2517 ILVRL_W2_SW(in1, in0, s0_m, s1_m); \ 2518 ILVRL_W2_SW(in3, in2, s2_m, s3_m); \ 2519 \ 2520 out0 = (v4i32) __msa_ilvr_d((v2i64) s2_m, (v2i64) s0_m); \ 2521 out1 = (v4i32) __msa_ilvl_d((v2i64) s2_m, (v2i64) s0_m); \ 2522 out2 = (v4i32) __msa_ilvr_d((v2i64) s3_m, (v2i64) s1_m); \ 2523 out3 = (v4i32) __msa_ilvl_d((v2i64) s3_m, (v2i64) s1_m); \ 2524 } 2525 2526 /* Description : Average byte elements from pair of vectors and store 8x4 byte 2527 block in destination memory 2528 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride 2529 Details : Each byte element from input vector pair 'in0' and 'in1' are 2530 averaged (a + b)/2 and stored in 'tmp0_m' 2531 Each byte element from input vector pair 'in2' and 'in3' are 2532 averaged (a + b)/2 and stored in 'tmp1_m' 2533 Each byte element from input vector pair 'in4' and 'in5' are 2534 averaged (a + b)/2 and stored in 'tmp2_m' 2535 Each byte element from input vector pair 'in6' and 'in7' are 2536 averaged (a + b)/2 and stored in 'tmp3_m' 2537 The half vector results from all 4 vectors are stored in 2538 destination memory as 8x4 byte block 2539 */ 2540 #define AVE_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ 2541 { \ 2542 uint64_t out0_m, out1_m, out2_m, out3_m; \ 2543 v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ 2544 \ 2545 tmp0_m = __msa_ave_u_b((v16u8) in0, (v16u8) in1); \ 2546 tmp1_m = __msa_ave_u_b((v16u8) in2, (v16u8) in3); \ 2547 tmp2_m = __msa_ave_u_b((v16u8) in4, (v16u8) in5); \ 2548 tmp3_m = __msa_ave_u_b((v16u8) in6, (v16u8) in7); \ 2549 \ 2550 out0_m = __msa_copy_u_d((v2i64) tmp0_m, 0); \ 2551 out1_m = __msa_copy_u_d((v2i64) tmp1_m, 0); \ 2552 out2_m = __msa_copy_u_d((v2i64) tmp2_m, 0); \ 2553 out3_m = __msa_copy_u_d((v2i64) tmp3_m, 0); \ 2554 SD4(out0_m, out1_m, out2_m, out3_m, pdst, stride); \ 2555 } 2556 2557 /* Description : Average byte elements from pair of vectors and store 16x4 byte 2558 block in destination memory 2559 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride 2560 Details : Each byte element from input vector pair 'in0' and 'in1' are 2561 averaged (a + b)/2 and stored in 'tmp0_m' 2562 Each byte element from input vector pair 'in2' and 'in3' are 2563 averaged (a + b)/2 and stored in 'tmp1_m' 2564 Each byte element from input vector pair 'in4' and 'in5' are 2565 averaged (a + b)/2 and stored in 'tmp2_m' 2566 Each byte element from input vector pair 'in6' and 'in7' are 2567 averaged (a + b)/2 and stored in 'tmp3_m' 2568 The results from all 4 vectors are stored in destination 2569 memory as 16x4 byte block 2570 */ 2571 #define AVE_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ 2572 { \ 2573 v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ 2574 \ 2575 tmp0_m = __msa_ave_u_b((v16u8) in0, (v16u8) in1); \ 2576 tmp1_m = __msa_ave_u_b((v16u8) in2, (v16u8) in3); \ 2577 tmp2_m = __msa_ave_u_b((v16u8) in4, (v16u8) in5); \ 2578 tmp3_m = __msa_ave_u_b((v16u8) in6, (v16u8) in7); \ 2579 \ 2580 ST_UB4(tmp0_m, tmp1_m, tmp2_m, tmp3_m, pdst, stride); \ 2581 } 2582 2583 /* Description : Average rounded byte elements from pair of vectors and store 2584 8x4 byte block in destination memory 2585 
Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride 2586 Details : Each byte element from input vector pair 'in0' and 'in1' are 2587 average rounded (a + b + 1)/2 and stored in 'tmp0_m' 2588 Each byte element from input vector pair 'in2' and 'in3' are 2589 average rounded (a + b + 1)/2 and stored in 'tmp1_m' 2590 Each byte element from input vector pair 'in4' and 'in5' are 2591 average rounded (a + b + 1)/2 and stored in 'tmp2_m' 2592 Each byte element from input vector pair 'in6' and 'in7' are 2593 average rounded (a + b + 1)/2 and stored in 'tmp3_m' 2594 The half vector results from all 4 vectors are stored in 2595 destination memory as 8x4 byte block 2596 */ 2597 #define AVER_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ 2598 { \ 2599 uint64_t out0_m, out1_m, out2_m, out3_m; \ 2600 v16u8 tp0_m, tp1_m, tp2_m, tp3_m; \ 2601 \ 2602 AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7, \ 2603 tp0_m, tp1_m, tp2_m, tp3_m); \ 2604 \ 2605 out0_m = __msa_copy_u_d((v2i64) tp0_m, 0); \ 2606 out1_m = __msa_copy_u_d((v2i64) tp1_m, 0); \ 2607 out2_m = __msa_copy_u_d((v2i64) tp2_m, 0); \ 2608 out3_m = __msa_copy_u_d((v2i64) tp3_m, 0); \ 2609 SD4(out0_m, out1_m, out2_m, out3_m, pdst, stride); \ 2610 } 2611 2612 /* Description : Average rounded byte elements from pair of vectors and store 2613 16x4 byte block in destination memory 2614 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride 2615 Details : Each byte element from input vector pair 'in0' and 'in1' are 2616 average rounded (a + b + 1)/2 and stored in 'tmp0_m' 2617 Each byte element from input vector pair 'in2' and 'in3' are 2618 average rounded (a + b + 1)/2 and stored in 'tmp1_m' 2619 Each byte element from input vector pair 'in4' and 'in5' are 2620 average rounded (a + b + 1)/2 and stored in 'tmp2_m' 2621 Each byte element from input vector pair 'in6' and 'in7' are 2622 average rounded (a + b + 1)/2 and stored in 'tmp3_m' 2623 The vector results from all 4 vectors are stored in 2624 destination memory as 16x4 byte block 2625 */ 2626 #define AVER_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ 2627 { \ 2628 v16u8 t0_m, t1_m, t2_m, t3_m; \ 2629 \ 2630 AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7, \ 2631 t0_m, t1_m, t2_m, t3_m); \ 2632 ST_UB4(t0_m, t1_m, t2_m, t3_m, pdst, stride); \ 2633 } 2634 2635 /* Description : Average rounded byte elements from pair of vectors, 2636 average rounded with destination and store 8x4 byte block 2637 in destination memory 2638 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride 2639 Details : Each byte element from input vector pair 'in0' and 'in1' are 2640 average rounded (a + b + 1)/2 and stored in 'tmp0_m' 2641 Each byte element from input vector pair 'in2' and 'in3' are 2642 average rounded (a + b + 1)/2 and stored in 'tmp1_m' 2643 Each byte element from input vector pair 'in4' and 'in5' are 2644 average rounded (a + b + 1)/2 and stored in 'tmp2_m' 2645 Each byte element from input vector pair 'in6' and 'in7' are 2646 average rounded (a + b + 1)/2 and stored in 'tmp3_m' 2647 The half vector results from all 4 vectors are stored in 2648 destination memory as 8x4 byte block 2649 */ 2650 #define AVER_DST_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, \ 2651 pdst, stride) \ 2652 { \ 2653 v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ 2654 v16u8 dst0_m, dst1_m, dst2_m, dst3_m; \ 2655 \ 2656 LD_UB4(pdst, stride, dst0_m, dst1_m, dst2_m, dst3_m); \ 2657 AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7, \ 2658 
                tmp0_m, tmp1_m, tmp2_m, tmp3_m);                 \
    AVER_ST8x4_UB(dst0_m, tmp0_m, dst1_m, tmp1_m,                \
                  dst2_m, tmp2_m, dst3_m, tmp3_m, pdst, stride); \
}

/* Description : Average rounded byte elements from pair of vectors,
                 average rounded with destination and store 16x4 byte block
                 in destination memory
   Arguments   : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : Each byte element from the input vector pair 'in0' and 'in1'
                 is average rounded ((a + b + 1) / 2) and stored in 'tmp0_m'
                 Each byte element from the input vector pair 'in2' and 'in3'
                 is average rounded ((a + b + 1) / 2) and stored in 'tmp1_m'
                 Each byte element from the input vector pair 'in4' and 'in5'
                 is average rounded ((a + b + 1) / 2) and stored in 'tmp2_m'
                 Each byte element from the input vector pair 'in6' and 'in7'
                 is average rounded ((a + b + 1) / 2) and stored in 'tmp3_m'
                 The vector results from all 4 vectors are stored in
                 destination memory as a 16x4 byte block
*/
#define AVER_DST_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7,  \
                           pdst, stride)                            \
{                                                                   \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                           \
    v16u8 dst0_m, dst1_m, dst2_m, dst3_m;                           \
                                                                    \
    LD_UB4(pdst, stride, dst0_m, dst1_m, dst2_m, dst3_m);           \
    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,             \
                tmp0_m, tmp1_m, tmp2_m, tmp3_m);                    \
    AVER_ST16x4_UB(dst0_m, tmp0_m, dst1_m, tmp1_m,                  \
                   dst2_m, tmp2_m, dst3_m, tmp3_m, pdst, stride);   \
}

/* Description : Add block 4x4
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Least significant 4 bytes from each input vector are added
                 to the destination bytes, clipped to the 0-255 range and
                 then stored.
*/
#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride)         \
{                                                                 \
    uint32_t src0_m, src1_m, src2_m, src3_m;                      \
    uint32_t out0_m, out1_m, out2_m, out3_m;                      \
    v8i16 inp0_m, inp1_m, res0_m, res1_m;                         \
    v16i8 dst0_m = { 0 };                                         \
    v16i8 dst1_m = { 0 };                                         \
    v16i8 zero_m = { 0 };                                         \
                                                                  \
    ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m)                \
    LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m);            \
    INSERT_W2_SB(src0_m, src1_m, dst0_m);                         \
    INSERT_W2_SB(src2_m, src3_m, dst1_m);                         \
    ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m);   \
    ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m);         \
    CLIP_SH2_0_255(res0_m, res1_m);                               \
    PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m);  \
                                                                  \
    out0_m = __msa_copy_u_w((v4i32) dst0_m, 0);                   \
    out1_m = __msa_copy_u_w((v4i32) dst0_m, 1);                   \
    out2_m = __msa_copy_u_w((v4i32) dst1_m, 0);                   \
    out3_m = __msa_copy_u_w((v4i32) dst1_m, 1);                   \
    SW4(out0_m, out1_m, out2_m, out3_m, pdst, stride);            \
}

/* Description : Dot product and addition of 3 signed halfword input vectors
   Arguments   : Inputs - in0, in1, in2, coeff0, coeff1, coeff2
                 Output - out0_m
                 Return Type - signed halfword
   Details     : Dot product of 'in0' with 'coeff0'
                 Dot product of 'in1' with 'coeff1'
                 Dot product of 'in2' with 'coeff2'
                 Addition of all the 3 vector results

                 out0_m = (in0 * coeff0) + (in1 * coeff1) + (in2 * coeff2)
*/
#define DPADD_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2)         \
( {                                                                 \
    v8i16 out0_m;                                                   \
                                                                    \
    out0_m = __msa_dotp_s_h((v16i8) in0, (v16i8) coeff0);           \
    out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in1, (v16i8) coeff1);  \
    out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in2, (v16i8) coeff2);  \
                                                                    \
    out0_m;                                                         \
} )
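/* Editor's illustrative sketch (not part of the original header): the usual
 * pattern behind DPADD_SH3_SH - three byte-pair dot products accumulated
 * into one halfword vector, e.g. for a 3-coefficient kernel with inputs
 * already shuffled into place.  All names are hypothetical; compile-guarded.
 */
#if 0
static v8i16 example_3tap_kernel(v16i8 vec0, v16i8 vec1, v16i8 vec2,
                                 v16i8 coeff0, v16i8 coeff1, v16i8 coeff2)
{
    /* out = vec0 * coeff0 + vec1 * coeff1 + vec2 * coeff2, accumulated
     * per halfword element from adjacent byte pairs */
    return DPADD_SH3_SH(vec0, vec1, vec2, coeff0, coeff1, coeff2);
}
#endif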
/* Description : Pack even elements of input vectors & xor with 128
   Arguments   : Inputs - in0, in1
                 Output - out_m
                 Return Type - unsigned byte
   Details     : Signed byte even elements from 'in0' and 'in1' are packed
                 together in one vector and the resulting vector is xor'ed
                 with 128 to shift the range from signed to unsigned byte
*/
#define PCKEV_XORI128_UB(in0, in1)                            \
( {                                                           \
    v16u8 out_m;                                              \
    out_m = (v16u8) __msa_pckev_b((v16i8) in1, (v16i8) in0);  \
    out_m = (v16u8) __msa_xori_b((v16u8) out_m, 128);         \
    out_m;                                                    \
} )

/* Description : Converts inputs to unsigned bytes, interleaves, averages and
                 stores them as an 8x4 unsigned byte block
   Arguments   : Inputs - in0, in1, in2, in3, dst0, dst1, pdst, stride
*/
#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3,           \
                                dst0, dst1, pdst, stride)     \
{                                                             \
    v16u8 tmp0_m, tmp1_m;                                     \
    uint8_t *pdst_m = (uint8_t *) (pdst);                     \
                                                              \
    tmp0_m = PCKEV_XORI128_UB(in0, in1);                      \
    tmp1_m = PCKEV_XORI128_UB(in2, in3);                      \
    AVER_UB2_UB(tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m);  \
    ST_D4(tmp0_m, tmp1_m, 0, 1, 0, 1, pdst_m, stride);        \
}

/* Description : Pack even byte elements, extract 0 & 2 index words from pair
                 of results and store 4 words in destination memory as per
                 stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
*/
#define PCKEV_ST4x4_UB(in0, in1, in2, in3, pdst, stride)  \
{                                                         \
    uint32_t out0_m, out1_m, out2_m, out3_m;              \
    v16i8 tmp0_m, tmp1_m;                                 \
                                                          \
    PCKEV_B2_SB(in1, in0, in3, in2, tmp0_m, tmp1_m);      \
                                                          \
    out0_m = __msa_copy_u_w((v4i32) tmp0_m, 0);           \
    out1_m = __msa_copy_u_w((v4i32) tmp0_m, 2);           \
    out2_m = __msa_copy_u_w((v4i32) tmp1_m, 0);           \
    out3_m = __msa_copy_u_w((v4i32) tmp1_m, 2);           \
                                                          \
    SW4(out0_m, out1_m, out2_m, out3_m, pdst, stride);    \
}

/* Description : Pack even byte elements and store byte vector in destination
                 memory
   Arguments   : Inputs - in0, in1, pdst
*/
#define PCKEV_ST_SB(in0, in1, pdst)                   \
{                                                     \
    v16i8 tmp_m;                                      \
    tmp_m = __msa_pckev_b((v16i8) in1, (v16i8) in0);  \
    ST_SB(tmp_m, (pdst));                             \
}

/* Description : Horizontal 2 tap filter kernel code
   Arguments   : Inputs - in0, in1, mask, coeff, shift
*/
#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift)            \
( {                                                                 \
    v16i8 tmp0_m;                                                   \
    v8u16 tmp1_m;                                                   \
                                                                    \
    tmp0_m = __msa_vshf_b((v16i8) mask, (v16i8) in1, (v16i8) in0);  \
    tmp1_m = __msa_dotp_u_h((v16u8) tmp0_m, (v16u8) coeff);         \
    tmp1_m = (v8u16) __msa_srari_h((v8i16) tmp1_m, shift);          \
    tmp1_m = __msa_sat_u_h(tmp1_m, shift);                          \
                                                                    \
    tmp1_m;                                                         \
} )
#endif  /* AVUTIL_MIPS_GENERIC_MACROS_MSA_H */