/******************************************************************************
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2023, Nathan Egge
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/

#include "src/riscv/asm.S"

function inv_txfm_add_4x4_rvv, export=1, ext=v
        csrw vxrm, zero

        vsetivli zero, 4, e16, mf2, ta, ma
        vle16.v v0, (a2)
        addi t0, a2, 8
        vle16.v v1, (t0)
        addi t0, t0, 8
        vle16.v v2, (t0)
        addi t0, t0, 8
        vle16.v v3, (t0)

        jalr t0, a4

        vmv.v.x v4, zero

        vsseg4e16.v v0, (a2)
        vle16.v v0, (a2)
        vse16.v v4, (a2)
        addi t0, a2, 8
        vle16.v v1, (t0)
        vse16.v v4, (t0)
        addi t0, t0, 8
        vle16.v v2, (t0)
        vse16.v v4, (t0)
        addi t0, t0, 8
        vle16.v v3, (t0)
        vse16.v v4, (t0)

        jalr t0, a5

        vssra.vi v0, v0, 4
        vssra.vi v1, v1, 4
        vssra.vi v2, v2, 4
        vssra.vi v3, v3, 4

itx_4x4_end:
        vsetvli zero, zero, e8, mf4, ta, ma
        vle8.v v4, (a0)
        add t0, a0, a1
        vle8.v v5, (t0)
        add t0, t0, a1
        vle8.v v6, (t0)
        add t0, t0, a1
        vle8.v v7, (t0)

        vwaddu.wv v0, v0, v4
        vwaddu.wv v1, v1, v5
        vwaddu.wv v2, v2, v6
        vwaddu.wv v3, v3, v7

        vsetvli zero, zero, e16, mf2, ta, ma
        vmax.vx v0, v0, zero
        vmax.vx v1, v1, zero
        vmax.vx v2, v2, zero
        vmax.vx v3, v3, zero

        vsetvli zero, zero, e8, mf4, ta, ma

        vnclipu.wi v4, v0, 0
        vnclipu.wi v5, v1, 0
        vnclipu.wi v6, v2, 0
        vnclipu.wi v7, v3, 0

        vse8.v v4, (a0)
        add a0, a0, a1
        vse8.v v5, (a0)
        add a0, a0, a1
        vse8.v v6, (a0)
        add a0, a0, a1
        vse8.v v7, (a0)

        ret
endfunc

function inv_identity_e16_x4_rvv, export=1, ext=v
        li t1, (5793-4096)*8
        vsmul.vx v4, v0, t1
        vsmul.vx v5, v1, t1
        vsmul.vx v6, v2, t1
        vsmul.vx v7, v3, t1

        vsadd.vv v0, v0, v4
        vsadd.vv v1, v1, v5
        vsadd.vv v2, v2, v6
        vsadd.vv v3, v3, v7

        jr t0
endfunc

.macro iwht_4
        vadd.vv v0, v0, v1
        vsub.vv v5, v2, v3
        vsub.vv v4, v0, v5
        vsra.vi v4, v4, 1
        vsub.vv v2, v4, v1
        vsub.vv v1, v4, v3
        vadd.vv v3, v5, v2
        vsub.vv v0, v0, v1
.endm

.macro idct_4 o0, o1, o2, o3
        li t1, 2896
        li t2, 1567
        li t3, 3784

        vwmul.vx v16, \o0, t1
        vwmul.vx v18, \o0, t1
        vwmacc.vx v16, t1, \o2
        neg t1, t1
        vwmacc.vx v18, t1, \o2

        vwmul.vx v20, \o1, t3
        neg t3, t3
        vwmul.vx v22, \o1, t2
        vwmacc.vx v20, t2, \o3
        vwmacc.vx v22, t3, \o3

        li t1, 2048

        vwadd.wx v16, v16, t1
        vwadd.wx v18, v18, t1
        vwadd.wx v20, v20, t1
        vwadd.wx v22, v22, t1

        vnsra.wi v16, v16, 12
        vnsra.wi v18, v18, 12
        vnsra.wi v20, v20, 12
        vnsra.wi v22, v22, 12

        vsadd.vv \o0, v16, v20
        vsadd.vv \o1, v18, v22
        vssub.vv \o2, v18, v22
        vssub.vv \o3, v16, v20
.endm

.macro iadst_4 o0, o1, o2, o3, lm2, lm
        li t1, 1321
        li t2, 3803
        li t3, 2482

        vwmul.vx v16, v0, t1
        vwmul.vx v18, v0, t3
        neg t1, t1
        vwmacc.vx v16, t2, v2
        vwmacc.vx v18, t1, v2
        neg t2, t2
        vwmacc.vx v16, t3, v3
        vwmacc.vx v18, t2, v3

        vwsub.vv v20, v0, v2
        vwadd.wv v20, v20, v3

        li t1, 3344
        vwmul.vx v22, v1, t1

        vsetvli zero, zero, e32, \lm2, ta, ma

        vmul.vx v20, v20, t1

        vadd.vv v24, v16, v18
        vadd.vv v16, v16, v22
        vadd.vv v18, v18, v22
        vsub.vv v22, v24, v22

        li t1, 2048

        vadd.vx v16, v16, t1
        vadd.vx v18, v18, t1
        vadd.vx v20, v20, t1
        vadd.vx v22, v22, t1

        vsetvli zero, zero, e16, \lm, ta, ma

        vnsra.wi \o0, v16, 12
        vnsra.wi \o1, v18, 12
        vnsra.wi \o2, v20, 12
        vnsra.wi \o3, v22, 12
.endm

function inv_dct_e16_x4_rvv, export=1, ext=v
        idct_4 v0, v1, v2, v3
        jr t0
endfunc

function inv_adst_e16_x4_rvv, export=1, ext=v
        iadst_4 v0, v1, v2, v3, m1, mf2
        jr t0
endfunc

function inv_flipadst_e16_x4_rvv, export=1, ext=v
        iadst_4 v3, v2, v1, v0, m1, mf2
        jr t0
endfunc

function inv_adst_e16_x4w_rvv, export=1, ext=v
        iadst_4 v0, v1, v2, v3, m2, m1
        jr t0
endfunc

function inv_flipadst_e16_x4w_rvv, export=1, ext=v
        iadst_4 v3, v2, v1, v0, m2, m1
        jr t0
endfunc

function inv_txfm_add_wht_wht_4x4_8bpc_rvv, export=1, ext=v
        csrw vxrm, zero

        vsetivli zero, 4, e16, mf2, ta, ma
        vle16.v v0, (a2)
        addi t0, a2, 8
        vle16.v v1, (t0)
        addi t0, t0, 8
        vle16.v v2, (t0)
        addi t0, t0, 8
        vle16.v v3, (t0)

        vsra.vi v0, v0, 2
        vsra.vi v1, v1, 2
        vsra.vi v2, v2, 2
        vsra.vi v3, v3, 2

        iwht_4

        vmv.v.x v4, zero

        vsseg4e16.v v0, (a2)
        vle16.v v0, (a2)
        vse16.v v4, (a2)
        addi t0, a2, 8
        vle16.v v1, (t0)
        vse16.v v4, (t0)
        addi t0, t0, 8
        vle16.v v2, (t0)
        vse16.v v4, (t0)
        addi t0, t0, 8
        vle16.v v3, (t0)
        vse16.v v4, (t0)

        iwht_4

        j itx_4x4_end
endfunc

.macro def_fn_4x4 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_rvv, export=1, ext=v
.ifc \txfm1\()_\txfm2, dct_dct
        beqz a3, 1f
.endif
        la a4, inv_\txfm1\()_e16_x4_rvv
        la a5, inv_\txfm2\()_e16_x4_rvv
        j inv_txfm_add_4x4_rvv
.ifc \txfm1\()_\txfm2, dct_dct
1:
        csrw vxrm, zero
        vsetivli zero, 4, e16, mf2, ta, ma
        ld t2, (a2)
        li t1, 2896*8
        vmv.v.x v0, t2
        vsmul.vx v0, v0, t1
        sd x0, (a2)
        vsmul.vx v0, v0, t1
        vssra.vi v0, v0, 4
        vmv.v.v v1, v0
        vmv.v.v v2, v0
        vmv.v.v v3, v0
        j itx_4x4_end
.endif
endfunc
.endm

def_fn_4x4 dct, dct
def_fn_4x4 identity, identity
def_fn_4x4 dct, adst
def_fn_4x4 dct, flipadst
def_fn_4x4 dct, identity
def_fn_4x4 adst, dct
def_fn_4x4 adst, adst
def_fn_4x4 adst, flipadst
def_fn_4x4 flipadst, dct
def_fn_4x4 flipadst, adst
def_fn_4x4 flipadst, flipadst
def_fn_4x4 identity, dct
def_fn_4x4 adst, identity
def_fn_4x4 flipadst, identity
def_fn_4x4 identity, adst
def_fn_4x4 identity, flipadst

.macro def_fn_8x8_base variant
function inv_txfm_\variant\()add_8x8_rvv, export=1, ext=v
        csrw vxrm, zero

        vsetivli zero, 8, e16, m1, ta, ma
        vle16.v v0, (a2)
        addi t0, a2, 16
        vle16.v v1, (t0)
        addi t0, t0, 16
        vle16.v v2, (t0)
        addi t0, t0, 16
        vle16.v v3, (t0)
        addi t0, t0, 16
        vle16.v v4, (t0)
        addi t0, t0, 16
        vle16.v v5, (t0)
        addi t0, t0, 16
        vle16.v v6, (t0)
        addi t0, t0, 16
        vle16.v v7, (t0)

.ifc \variant, identity_
        // The identity vsadd.vv and downshift vssra.vi 1 cancel out

        j L(itx_8x8_epilog)
.else
        jalr t0, a4

        vssra.vi v0, v0, 1
        vssra.vi v1, v1, 1
        vssra.vi v2, v2, 1
        vssra.vi v3, v3, 1
        vssra.vi v4, v4, 1
        vssra.vi v5, v5, 1
        vssra.vi v6, v6, 1
        vssra.vi v7, v7, 1

L(itx_8x8_epilog):
        vsseg8e16.v v0, (a2)
        vle16.v v0, (a2)
        addi t0, a2, 16
        vle16.v v1, (t0)
        addi t0, t0, 16
        vle16.v v2, (t0)
        addi t0, t0, 16
        vle16.v v3, (t0)
        addi t0, t0, 16
        vle16.v v4, (t0)
        addi t0, t0, 16
        vle16.v v5, (t0)
        addi t0, t0, 16
        vle16.v v6, (t0)
        addi t0, t0, 16
        vle16.v v7, (t0)

        jalr t0, a5

        vssra.vi v0, v0, 4
        vssra.vi v1, v1, 4
        vssra.vi v2, v2, 4
        vssra.vi v3, v3, 4
        vssra.vi v4, v4, 4
        vssra.vi v5, v5, 4
        vssra.vi v6, v6, 4
        vssra.vi v7, v7, 4

        li t1, 64
        vsetvli zero, t1, e16, m8, ta, ma
        vmv.v.x v8, zero
        vse16.v v8, (a2)

itx_8x8_end:
        vsetivli zero, 8, e8, mf2, ta, ma
        vle8.v v8, (a0)
        add t0, a0, a1
        vle8.v v9, (t0)
        add t0, t0, a1
        vle8.v v10, (t0)
        add t0, t0, a1
        vle8.v v11, (t0)
        add t0, t0, a1
        vle8.v v12, (t0)
        add t0, t0, a1
        vle8.v v13, (t0)
        add t0, t0, a1
        vle8.v v14, (t0)
        add t0, t0, a1
        vle8.v v15, (t0)

        vwaddu.wv v0, v0, v8
        vwaddu.wv v1, v1, v9
        vwaddu.wv v2, v2, v10
        vwaddu.wv v3, v3, v11
        vwaddu.wv v4, v4, v12
        vwaddu.wv v5, v5, v13
        vwaddu.wv v6, v6, v14
        vwaddu.wv v7, v7, v15

        vsetvli zero, zero, e16, m1, ta, ma
        vmax.vx v0, v0, zero
        vmax.vx v1, v1, zero
        vmax.vx v2, v2, zero
        vmax.vx v3, v3, zero
        vmax.vx v4, v4, zero
        vmax.vx v5, v5, zero
        vmax.vx v6, v6, zero
        vmax.vx v7, v7, zero

        vsetvli zero, zero, e8, mf2, ta, ma

        vnclipu.wi v8, v0, 0
        vnclipu.wi v9, v1, 0
        vnclipu.wi v10, v2, 0
        vnclipu.wi v11, v3, 0
        vnclipu.wi v12, v4, 0
        vnclipu.wi v13, v5, 0
        vnclipu.wi v14, v6, 0
        vnclipu.wi v15, v7, 0

        vse8.v v8, (a0)
        add a0, a0, a1
        vse8.v v9, (a0)
        add a0, a0, a1
        vse8.v v10, (a0)
        add a0, a0, a1
        vse8.v v11, (a0)
        add a0, a0, a1
        vse8.v v12, (a0)
        add a0, a0, a1
        vse8.v v13, (a0)
        add a0, a0, a1
        vse8.v v14, (a0)
        add a0, a0, a1
        vse8.v v15, (a0)

        ret
.endif
endfunc
.endm

def_fn_8x8_base identity_
def_fn_8x8_base

function inv_identity_e16_x8_rvv, export=1, ext=v
        vsadd.vv v0, v0, v0
        vsadd.vv v1, v1, v1
        vsadd.vv v2, v2, v2
        vsadd.vv v3, v3, v3
        vsadd.vv v4, v4, v4
        vsadd.vv v5, v5, v5
        vsadd.vv v6, v6, v6
        vsadd.vv v7, v7, v7

        jr t0
endfunc

.macro idct_8 o0, o1, o2, o3, o4, o5, o6, o7
        idct_4 \o0, \o2, \o4, \o6

        li t1, 799
        li t2, 4017
        li t3, 3406
        li t4, 2276

        vwmul.vx v22, \o1, t2
        neg t2, t2
        vwmul.vx v16, \o1, t1
        vwmacc.vx v22, t1, \o7
        vwmacc.vx v16, t2, \o7

        vwmul.vx v20, \o5, t4
        neg t4, t4
        vwmul.vx v18, \o5, t3
        vwmacc.vx v20, t3, \o3
        vwmacc.vx v18, t4, \o3

        li t1, 2048

        vwadd.wx v16, v16, t1
        vwadd.wx v18, v18, t1
        vwadd.wx v20, v20, t1
        vwadd.wx v22, v22, t1

        vnsra.wi v16, v16, 12
        vnsra.wi v18, v18, 12
        vnsra.wi v20, v20, 12
        vnsra.wi v22, v22, 12

        vssub.vv \o7, v22, v20
        vsadd.vv v22, v22, v20
        vssub.vv \o1, v16, v18
        vsadd.vv v16, v16, v18

        li t2, 2896

        vwmul.vx v18, \o7, t2
        vwmul.vx v20, \o7, t2
        vwmacc.vx v20, t2, \o1
        neg t2, t2
        vwmacc.vx v18, t2, \o1

        vwadd.wx v18, v18, t1
        vwadd.wx v20, v20, t1

        vnsra.wi v18, v18, 12
        vnsra.wi v20, v20, 12

        vssub.vv \o7, \o0, v22
        vsadd.vv \o0, \o0, v22
        vssub.vv v17, \o2, v20
        vsadd.vv \o1, \o2, v20
        vssub.vv \o5, \o4, v18
        vsadd.vv \o2, \o4, v18
        vssub.vv \o4, \o6, v16
        vsadd.vv \o3, \o6, v16
        vmv.v.v \o6, v17
.endm

.macro iadst_8 o0, o1, o2, o3, o4, o5, o6, o7
        li t1, 4076
        li t2, 401
        li t3, 3612
        li t4, 1931
        li t5, 2598
        li t6, 3166

        vwmul.vx v16, v7, t1
        neg t1, t1
        vwmul.vx v18, v7, t2
        vwmacc.vx v16, t2, v0
        vwmacc.vx v18, t1, v0

        vwmul.vx v20, v5, t3
        neg t3, t3
        vwmul.vx v22, v5, t4
        vwmacc.vx v20, t4, v2
        vwmacc.vx v22, t3, v2

        vwmul.vx v24, v3, t5
        neg t5, t5
        vwmul.vx v26, v3, t6
        vwmacc.vx v24, t6, v4
        vwmacc.vx v26, t5, v4

        li t1, 2048
        li t2, 1189
        li t3, 3920
        li t4, 1567
        li t5, 3784
        li t6, 2896

        vwmul.vx v28, v1, t2
        neg t2, t2
        vwmul.vx v30, v1, t3
        vwmacc.vx v28, t3, v6
        vwmacc.vx v30, t2, v6

        vwadd.wx v16, v16, t1
        vwadd.wx v18, v18, t1
        vwadd.wx v20, v20, t1
        vwadd.wx v22, v22, t1
        vwadd.wx v24, v24, t1
        vwadd.wx v26, v26, t1
        vwadd.wx v28, v28, t1
        vwadd.wx v30, v30, t1

        vnsra.wi v16, v16, 12
        vnsra.wi v18, v18, 12
        vnsra.wi v20, v20, 12
        vnsra.wi v22, v22, 12
        vnsra.wi v24, v24, 12
        vnsra.wi v26, v26, 12
        vnsra.wi v28, v28, 12
        vnsra.wi v30, v30, 12

        vssub.vv v4, v16, v24
        vsadd.vv v16, v16, v24
        vsadd.vv v1, v18, v26
        vsadd.vv v2, v20, v28
        vsadd.vv v3, v22, v30
        vssub.vv v5, v18, v26
        vssub.vv v6, v20, v28
        vssub.vv v30, v22, v30

        vsadd.vv \o0, v16, v2
        vsadd.vv \o7, v1, v3
        vssub.vv v2, v16, v2
        vssub.vv v3, v1, v3

        vwmul.vx v16, v4, t5
        vwmul.vx v18, v4, t4
        vwmul.vx v20, v30, t5
        vwmul.vx v22, v30, t4
        vwmacc.vx v16, t4, v5
        neg t4, t4
        vwmacc.vx v22, t5, v6
        neg t5, t5
        vwmacc.vx v20, t4, v6
        vwmacc.vx v18, t5, v5

        vwadd.wx v16, v16, t1
        vwadd.wx v18, v18, t1
        vwadd.wx v20, v20, t1
        vwadd.wx v22, v22, t1

        vnsra.wi v16, v16, 12
        vnsra.wi v18, v18, 12
        vnsra.wi v20, v20, 12
        vnsra.wi v22, v22, 12

        vsadd.vv \o1, v16, v20
        vsadd.vv \o6, v18, v22
        vssub.vv v16, v16, v20
        vssub.vv v17, v18, v22

        vwmul.vx v18, v2, t6
        vwmul.vx v20, v2, t6
        vwmul.vx v22, v16, t6
        vwmul.vx v24, v16, t6
        vwmacc.vx v18, t6, v3
        vwmacc.vx v22, t6, v17
        neg t6, t6
        vwmacc.vx v20, t6, v3
        vwmacc.vx v24, t6, v17

        vwadd.wx v18, v18, t1
        vwadd.wx v20, v20, t1
        vwadd.wx v22, v22, t1
        vwadd.wx v24, v24, t1

        vnsra.wi \o3, v18, 12
        vnsra.wi \o4, v20, 12
        vnsra.wi \o2, v22, 12
        vnsra.wi \o5, v24, 12

        vmv.v.x v16, zero
        vssub.vv \o1, v16, \o1
        vssub.vv \o3, v16, \o3
        vssub.vv \o5, v16, \o5
        vssub.vv \o7, v16, \o7
.endm

function inv_dct_e16_x8_rvv, export=1, ext=v
        idct_8 v0, v1, v2, v3, v4, v5, v6, v7
        jr t0
endfunc

function inv_adst_e16_x8_rvv, export=1, ext=v
        iadst_8 v0, v1, v2, v3, v4, v5, v6, v7
        jr t0
endfunc

function inv_flipadst_e16_x8_rvv, export=1, ext=v
        iadst_8 v7, v6, v5, v4, v3, v2, v1, v0
        jr t0
endfunc

.macro def_fn_8x8 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_rvv, export=1, ext=v
.ifc \txfm1\()_\txfm2, dct_dct
        beqz a3, 1f
.endif
        la a5, inv_\txfm2\()_e16_x8_rvv
.ifc \txfm1, identity
        j inv_txfm_identity_add_8x8_rvv
.else
        la a4, inv_\txfm1\()_e16_x8_rvv
        j inv_txfm_add_8x8_rvv
.endif
.ifc \txfm1\()_\txfm2, dct_dct
1:
        csrw vxrm, zero
        vsetivli zero, 8, e16, m1, ta, ma
        ld t2, (a2)
        li t1, 2896*8
        vmv.v.x v0, t2
        vsmul.vx v0, v0, t1
        sd x0, (a2)
        vssra.vi v0, v0, 1
        vsmul.vx v0, v0, t1
        vssra.vi v0, v0, 4
        vmv.v.v v1, v0
        vmv.v.v v2, v0
        vmv.v.v v3, v0
        vmv.v.v v4, v0
        vmv.v.v v5, v0
        vmv.v.v v6, v0
        vmv.v.v v7, v0
        j itx_8x8_end
.endif
endfunc
.endm

def_fn_8x8 dct, dct
def_fn_8x8 identity, identity
def_fn_8x8 dct, adst
def_fn_8x8 dct, flipadst
def_fn_8x8 dct, identity
def_fn_8x8 adst, dct
def_fn_8x8 adst, adst
def_fn_8x8 adst, flipadst
def_fn_8x8 flipadst, dct
def_fn_8x8 flipadst, adst
def_fn_8x8 flipadst, flipadst
def_fn_8x8 identity, dct
def_fn_8x8 adst, identity
def_fn_8x8 flipadst, identity
def_fn_8x8 identity, adst
def_fn_8x8 identity, flipadst

function inv_txfm_add_4x8_rvv, export=1, ext=v
        csrw vxrm, zero

        vsetivli zero, 8, e16, m1, ta, ma
        vle16.v v0, (a2)
        addi t0, a2, 16
        vle16.v v1, (t0)
        addi t0, t0, 16
        vle16.v v2, (t0)
        addi t0, t0, 16
        vle16.v v3, (t0)

        li t1, 2896*8
.irp i, 0, 1, 2, 3
        vsmul.vx v\i, v\i, t1
.endr

        jalr t0, a4

        vsseg4e16.v v0, (a2)

        vsetivli zero, 4, e16, mf2, ta, ma
        vmv.v.x v8, zero
        vle16.v v0, (a2)
        vse16.v v8, (a2)
.irp i, 1, 2, 3, 4, 5, 6, 7
        addi a2, a2, 8
        vle16.v v\i, (a2)
        vse16.v v8, (a2)
.endr

        jalr t0, a5

.irp i, 0, 1, 2, 3, 4, 5, 6, 7
        vssra.vi v\i, v\i, 4
.endr

        vsetvli zero, zero, e8, mf4, ta, ma
        vle8.v v8, (a0)
        add t0, a0, a1
        vle8.v v9, (t0)
.irp i, 10, 11, 12, 13, 14, 15
        add t0, t0, a1
        vle8.v v\i, (t0)
.endr

        vwaddu.wv v0, v0, v8
        vwaddu.wv v1, v1, v9
        vwaddu.wv v2, v2, v10
        vwaddu.wv v3, v3, v11
        vwaddu.wv v4, v4, v12
        vwaddu.wv v5, v5, v13
        vwaddu.wv v6, v6, v14
        vwaddu.wv v7, v7, v15

        vsetvli zero, zero, e16, mf2, ta, ma
.irp i, 0, 1, 2, 3, 4, 5, 6, 7
        vmax.vx v\i, v\i, zero
.endr

        vsetvli zero, zero, e8, mf4, ta, ma

        vnclipu.wi v8, v0, 0
        vnclipu.wi v9, v1, 0
        vnclipu.wi v10, v2, 0
        vnclipu.wi v11, v3, 0
        vnclipu.wi v12, v4, 0
        vnclipu.wi v13, v5, 0
        vnclipu.wi v14, v6, 0
        vnclipu.wi v15, v7, 0

        vse8.v v8, (a0)
.irp i, 9, 10, 11, 12, 13, 14, 15
        add a0, a0, a1
        vse8.v v\i, (a0)
.endr

        ret
endfunc

function inv_txfm_add_8x4_rvv, export=1, ext=v
        csrw vxrm, zero

        vsetivli zero, 4, e16, mf2, ta, ma
        vle16.v v0, (a2)
        addi t0, a2, 8
        vle16.v v1, (t0)
.irp i, 2, 3, 4, 5, 6, 7
        addi t0, t0, 8
        vle16.v v\i, (t0)
.endr

        li t1, 2896*8
.irp i, 0, 1, 2, 3, 4, 5, 6, 7
        vsmul.vx v\i, v\i, t1
.endr

        jalr t0, a4

        vsseg8e16.v v0, (a2)

        vsetivli zero, 8, e16, m1, ta, ma
        vmv.v.x v4, zero
        vle16.v v0, (a2)
        vse16.v v4, (a2)
.irp i, 1, 2, 3
        addi a2, a2, 16
        vle16.v v\i, (a2)
        vse16.v v4, (a2)
.endr

        jalr t0, a5

        vssra.vi v0, v0, 4
        vssra.vi v1, v1, 4
        vssra.vi v2, v2, 4
        vssra.vi v3, v3, 4

        vsetvli zero, zero, e8, mf2, ta, ma
        vle8.v v4, (a0)
        add t0, a0, a1
        vle8.v v5, (t0)
        add t0, t0, a1
        vle8.v v6, (t0)
        add t0, t0, a1
        vle8.v v7, (t0)

        vwaddu.wv v0, v0, v4
        vwaddu.wv v1, v1, v5
        vwaddu.wv v2, v2, v6
        vwaddu.wv v3, v3, v7

        vsetvli zero, zero, e16, m1, ta, ma
        vmax.vx v0, v0, zero
        vmax.vx v1, v1, zero
        vmax.vx v2, v2, zero
        vmax.vx v3, v3, zero

        vsetvli zero, zero, e8, mf2, ta, ma

        vnclipu.wi v4, v0, 0
        vnclipu.wi v5, v1, 0
        vnclipu.wi v6, v2, 0
        vnclipu.wi v7, v3, 0

        vse8.v v4, (a0)
        add a0, a0, a1
        vse8.v v5, (a0)
        add a0, a0, a1
        vse8.v v6, (a0)
        add a0, a0, a1
        vse8.v v7, (a0)

        ret
endfunc

/* Define symbols added in .if statement */
.equ dct, 1
.equ identity, 2
.equ adst, 3
.equ flipadst, 4

.macro def_fn_48 w, h, txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_rvv, export=1
.if \w == 4 && (\txfm1 == adst || \txfm1 == flipadst)
        la a4, inv_\txfm1\()_e16_x\w\()w_rvv
.else
        la a4, inv_\txfm1\()_e16_x\w\()_rvv
.endif
.if \h == 4 && (\txfm2 == adst || \txfm2 == flipadst)
        la a5, inv_\txfm2\()_e16_x\h\()w_rvv
.else
        la a5, inv_\txfm2\()_e16_x\h\()_rvv
.endif
        j inv_txfm_add_\w\()x\h\()_rvv
endfunc
.endm

.macro def_fns_48 w, h
def_fn_48 \w, \h, dct, dct
def_fn_48 \w, \h, identity, identity
def_fn_48 \w, \h, dct, adst
def_fn_48 \w, \h, dct, flipadst
def_fn_48 \w, \h, dct, identity
def_fn_48 \w, \h, adst, dct
def_fn_48 \w, \h, adst, adst
def_fn_48 \w, \h, adst, flipadst
def_fn_48 \w, \h, flipadst, dct
def_fn_48 \w, \h, flipadst, adst
def_fn_48 \w, \h, flipadst, flipadst
def_fn_48 \w, \h, identity, dct
def_fn_48 \w, \h, adst, identity
def_fn_48 \w, \h, flipadst, identity
def_fn_48 \w, \h, identity, adst
def_fn_48 \w, \h, identity, flipadst
.endm

def_fns_48 4, 8
def_fns_48 8, 4

function inv_identity_e16_x16_rvv, export=1, ext=v
        li t1, 2*(5793-4096)*8
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        vsmul.vx v16, v\i, t1
        vsadd.vv v\i, v\i, v\i
        vsadd.vv v\i, v\i, v16
.endr
        jr t0
endfunc

function inv_dct_e16_x16_rvv, export=1, ext=v
        idct_8 v0, v2, v4, v6, v8, v10, v12, v14

        li t1, 401
        li t2, 4076
        li t3, 3166
        li t4, 2598

        vwmul.vx v30, v1, t2
        neg t2, t2
        vwmul.vx v16, v1, t1
        vwmacc.vx v30, t1, v15
        vwmacc.vx v16, t2, v15

        vwmul.vx v28, v9, t4
        neg t4, t4
        vwmul.vx v18, v9, t3
        vwmacc.vx v28, t3, v7
        vwmacc.vx v18, t4, v7

        li t1, 1931
        li t2, 3612
        li t3, 3920
        li t4, 1189

        vwmul.vx v26, v5, t2
        neg t2, t2
        vwmul.vx v20, v5, t1
        vwmacc.vx v26, t1, v11
        vwmacc.vx v20, t2, v11

        vwmul.vx v24, v13, t4
        neg t4, t4
        vwmul.vx v22, v13, t3
        vwmacc.vx v24, t3, v3
        vwmacc.vx v22, t4, v3

        li t1, 2048
        li t2, 2896
        li t3, 1567
        li t4, 3784

        vwadd.wx v16, v16, t1
        vwadd.wx v18, v18, t1
        vwadd.wx v20, v20, t1
        vwadd.wx v22, v22, t1
        vwadd.wx v24, v24, t1
        vwadd.wx v26, v26, t1
        vwadd.wx v28, v28, t1
        vwadd.wx v30, v30, t1

        vnsra.wi v16, v16, 12
        vnsra.wi v18, v18, 12
        vnsra.wi v20, v20, 12
        vnsra.wi v22, v22, 12
        vnsra.wi v24, v24, 12
        vnsra.wi v26, v26, 12
        vnsra.wi v28, v28, 12
        vnsra.wi v30, v30, 12

        vssub.vv v3, v16, v18
        vsadd.vv v16, v16, v18
        vssub.vv v5, v22, v20
        vsadd.vv v22, v22, v20
        vssub.vv v11, v24, v26
        vsadd.vv v24, v24, v26
        vssub.vv v13, v30, v28
        vsadd.vv v30, v30, v28

        vwmul.vx v28, v13, t4
        neg t4, t4
        vwmul.vx v18, v13, t3
        vwmul.vx v26, v11, t3
        vwmacc.vx v28, t3, v3
        neg t3, t3
        vwmul.vx v20, v11, t4
        vwmacc.vx v18, t4, v3
        vwmacc.vx v20, t3, v5
        vwmacc.vx v26, t4, v5

        vwadd.wx v18, v18, t1
        vwadd.wx v20, v20, t1
        vwadd.wx v26, v26, t1
        vwadd.wx v28, v28, t1

        vnsra.wi v18, v18, 12
        vnsra.wi v20, v20, 12
        vnsra.wi v26, v26, 12
        vnsra.wi v28, v28, 12

        vssub.vv v5, v18, v20
        vsadd.vv v18, v18, v20
        vssub.vv v11, v28, v26
        vsadd.vv v28, v28, v26

        vssub.vv v7, v16, v22
        vsadd.vv v16, v16, v22
        vssub.vv v9, v30, v24
        vsadd.vv v30, v30, v24

        vwmul.vx v20, v11, t2
        vwmul.vx v22, v9, t2
        vwmul.vx v24, v9, t2
        vwmul.vx v26, v11, t2
        vwmacc.vx v24, t2, v7
        vwmacc.vx v26, t2, v5
        neg t2, t2
        vwmacc.vx v20, t2, v5
        vwmacc.vx v22, t2, v7

        vwadd.wx v20, v20, t1
        vwadd.wx v22, v22, t1
        vwadd.wx v24, v24, t1
        vwadd.wx v26, v26, t1

        vnsra.wi v20, v20, 12
        vnsra.wi v22, v22, 12
        vnsra.wi v24, v24, 12
        vnsra.wi v26, v26, 12

        vssub.vv v15, v0, v30
        vsadd.vv v0, v0, v30
        vssub.vv v17, v2, v28
        vsadd.vv v1, v2, v28
        vssub.vv v13, v4, v26
        vsadd.vv v2, v4, v26
        vssub.vv v19, v6, v24
        vsadd.vv v3, v6, v24
        vssub.vv v11, v8, v22
        vsadd.vv v4, v8, v22
        vsadd.vv v5, v10, v20
        vssub.vv v10, v10, v20
        vssub.vv v9, v12, v18
        vsadd.vv v6, v12, v18
        vssub.vv v8, v14, v16
        vsadd.vv v7, v14, v16
        vmv.v.v v14, v17
        vmv.v.v v12, v19

        jr t0
endfunc

.macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15
        li t1, 4091
        li t2, 201
        li t3, 3973
        li t4, 995

        vwmul.vx v16, v15, t1
        neg t1, t1
        vwmul.vx v18, v15, t2
        vwmacc.vx v16, t2, v0
        vwmacc.vx v18, t1, v0

        vwmul.vx v20, v13, t3
        neg t3, t3
        vwmul.vx v22, v13, t4
        vwmacc.vx v20, t4, v2
        vwmacc.vx v22, t3, v2

        li t1, 3703
        li t2, 1751
        li t3, 3290
        li t4, 2440

        vwmul.vx v24, v11, t1
        neg t1, t1
        vwmul.vx v26, v11, t2
        vwmacc.vx v24, t2, v4
        vwmacc.vx v26, t1, v4

        vwmul.vx v28, v9, t3
        neg t3, t3
        vwmul.vx v30, v9, t4
        vwmacc.vx v28, t4, v6
        vwmacc.vx v30, t3, v6

        li t1, 2048

        vwadd.wx v16, v16, t1
        vwadd.wx v18, v18, t1
        vwadd.wx v20, v20, t1
        vwadd.wx v22, v22, t1
        vwadd.wx v24, v24, t1
        vwadd.wx v26, v26, t1
        vwadd.wx v28, v28, t1
        vwadd.wx v30, v30, t1

        vnsra.wi v0, v16, 12
        vnsra.wi v18, v18, 12
        vnsra.wi v2, v20, 12
        vnsra.wi v22, v22, 12
        vnsra.wi v4, v24, 12
        vnsra.wi v26, v26, 12
        vnsra.wi v6, v28, 12
        vnsra.wi v30, v30, 12

        li t1, 2751
        li t2, 3035
        li t3, 2106
        li t4, 3513

        vwmul.vx v16, v7, t1
        neg t1, t1
        vwmul.vx v20, v7, t2
        vwmacc.vx v16, t2, v8
        vwmacc.vx v20, t1, v8

        vwmul.vx v24, v5, t3
        neg t3, t3
        vwmul.vx v28, v5, t4
        vwmacc.vx v24, t4, v10
        vwmacc.vx v28, t3, v10

        li t1, 2048

        vwadd.wx v16, v16, t1
        vwadd.wx v20, v20, t1
        vwadd.wx v24, v24, t1
        vwadd.wx v28, v28, t1

        vnsra.wi v16, v16, 12
        vnsra.wi v9, v20, 12
        vnsra.wi v24, v24, 12
        vnsra.wi v11, v28, 12

        vssub.vv v8, v0, v16
        vsadd.vv v0, v0, v16
        vssub.vv v10, v2, v24
        vsadd.vv v2, v2, v24

        li t1, 1380
        li t2, 3857
        li t3, 601
        li t4, 4052

        vwmul.vx v16, v3, t1
        neg t1, t1
        vwmul.vx v20, v3, t2
        vwmacc.vx v16, t2, v12
        vwmacc.vx v20, t1, v12

        vwmul.vx v24, v1, t3
        neg t3, t3
        vwmul.vx v28, v1, t4
        vwmacc.vx v24, t4, v14
        vwmacc.vx v28, t3, v14

        li t1, 2048

        vwadd.wx v16, v16, t1
        vwadd.wx v20, v20, t1
        vwadd.wx v24, v24, t1
        vwadd.wx v28, v28, t1

        vnsra.wi v16, v16, 12
        vnsra.wi v13, v20, 12
        vnsra.wi v24, v24, 12
        vnsra.wi v15, v28, 12

        vssub.vv v12, v4, v16
        vsadd.vv v16, v4, v16
        vssub.vv v14, v6, v24
        vsadd.vv v20, v6, v24

        vsadd.vv v1, v18, v9
        vssub.vv v9, v18, v9
        vsadd.vv v3, v22, v11
        vssub.vv v11, v22, v11
        vsadd.vv v18, v26, v13
        vssub.vv v13, v26, v13
        vsadd.vv v22, v30, v15
        vssub.vv v15, v30, v15

        vssub.vv v4, v0, v16
        vsadd.vv v0, v0, v16
        vssub.vv v5, v1, v18
        vsadd.vv v1, v1, v18
        vssub.vv v6, v2, v20
        vsadd.vv v2, v2, v20
        vssub.vv v7, v3, v22
        vsadd.vv v3, v3, v22

        li t1, 799
        li t2, 4017
        li t3, 3406
        li t4, 2276

        vwmul.vx v16, v8, t2
        vwmul.vx v18, v8, t1
        vwmul.vx v20, v10, t4
        vwmul.vx v22, v10, t3
        vwmul.vx v24, v13, t2
        vwmul.vx v26, v13, t1
        vwmul.vx v28, v15, t4
        vwmul.vx v30, v15, t3
        vwmacc.vx v16, t1, v9
        neg t1, t1
        vwmacc.vx v20, t3, v11
        neg t3, t3
        vwmacc.vx v26, t2, v12
        neg t2, t2
        vwmacc.vx v30, t4, v14
        neg t4, t4
        vwmacc.vx v18, t2, v9
        vwmacc.vx v22, t4, v11
        vwmacc.vx v24, t1, v12
        vwmacc.vx v28, t3, v14

        li t1, 2048
        li t2, 2896
        li t3, 1567
        li t4, 3784

        vwadd.wx v16, v16, t1
        vwadd.wx v18, v18, t1
        vwadd.wx v20, v20, t1
        vwadd.wx v22, v22, t1
        vwadd.wx v24, v24, t1
        vwadd.wx v26, v26, t1
        vwadd.wx v28, v28, t1
        vwadd.wx v30, v30, t1

        vnsra.wi v16, v16, 12
        vnsra.wi v18, v18, 12
        vnsra.wi v20, v20, 12
        vnsra.wi v22, v22, 12
        vnsra.wi v24, v24, 12
        vnsra.wi v26, v26, 12
        vnsra.wi v28, v28, 12
        vnsra.wi v30, v30, 12

        vsadd.vv v8, v16, v24
        vsadd.vv v9, v18, v26
        vsadd.vv v10, v20, v28
        vsadd.vv v11, v22, v30
        vssub.vv v12, v16, v24
        vssub.vv v13, v18, v26
        vssub.vv v14, v20, v28
        vssub.vv v15, v22, v30

        vwmul.vx v16, v4, t4
        vwmul.vx v18, v4, t3
        vwmul.vx v20, v7, t4
        vwmul.vx v22, v7, t3
        vwmul.vx v24, v12, t4
        vwmul.vx v26, v12, t3
        vwmul.vx v28, v15, t4
        vwmul.vx v30, v15, t3
        vwmacc.vx v16, t3, v5
        vwmacc.vx v22, t4, v6
        vwmacc.vx v24, t3, v13
        neg t3, t3
        vwmacc.vx v30, t4, v14
        neg t4, t4
        vwmacc.vx v20, t3, v6
        vwmacc.vx v28, t3, v14
        vwmacc.vx v18, t4, v5
        vwmacc.vx v26, t4, v13

        vwadd.wx v16, v16, t1
        vwadd.wx v18, v18, t1
        vwadd.wx v20, v20, t1
        vwadd.wx v22, v22, t1
        vwadd.wx v24, v24, t1
        vwadd.wx v26, v26, t1
        vwadd.wx v28, v28, t1
        vwadd.wx v30, v30, t1

        vnsra.wi v16, v16, 12
        vnsra.wi v18, v18, 12
        vnsra.wi v20, v20, 12
        vnsra.wi v22, v22, 12
        vnsra.wi v24, v24, 12
        vnsra.wi v26, v26, 12
        vnsra.wi v28, v28, 12
        vnsra.wi v30, v30, 12

.ifc \o0, v0
        vsadd.vv \o14, v9, v11
        vssub.vv v11, v9, v11
        vssub.vv v9, v1, v3
        vsadd.vv \o15, v1, v3
        vsadd.vv \o1, v8, v10
        vssub.vv v10, v8, v10
        vssub.vv v8, v0, v2
        vsadd.vv \o0, v0, v2
.else
        vsadd.vv \o1, v8, v10
        vssub.vv v10, v8, v10
        vssub.vv v8, v0, v2
        vsadd.vv \o0, v0, v2
        vsadd.vv v2, v9, v11
        vssub.vv v11, v9, v11
        vssub.vv v9, v1, v3
        vsadd.vv \o15, v1, v3
        vmv.v.v \o14, v2
.endif

        vsadd.vv \o3, v16, v20
        vssub.vv v6, v16, v20
        vsadd.vv \o12, v18, v22
        vssub.vv v7, v18, v22
        vsadd.vv \o2, v24, v28
        vssub.vv v24, v24, v28
        vsadd.vv \o13, v26, v30
        vssub.vv v26, v26, v30

        neg t3, t2

        vwmul.vx v28, v24, t2
        vwmul.vx v30, v24, t2
        vwmacc.vx v28, t2, v26
        vwmacc.vx v30, t3, v26

        vwmul.vx v24, v10, t2
        vwmul.vx v26, v10, t2
        vwmacc.vx v24, t2, v11
        vwmacc.vx v26, t3, v11

        vwmul.vx v20, v6, t2
        vwmul.vx v22, v6, t2
        vwmacc.vx v20, t2, v7
        vwmacc.vx v22, t3, v7

        vwmul.vx v16, v8, t2
        vwmul.vx v18, v8, t2
        vwmacc.vx v16, t2, v9
        vwmacc.vx v18, t3, v9

        vwadd.wx v16, v16, t1
        vwadd.wx v18, v18, t1
        vwadd.wx v20, v20, t1
        vwadd.wx v22, v22, t1
        vwadd.wx v24, v24, t1
        vwadd.wx v26, v26, t1
        vwadd.wx v28, v28, t1
        vwadd.wx v30, v30, t1

        vnsra.wi \o7, v16, 12
        vnsra.wi \o8, v18, 12
        vnsra.wi \o4, v20, 12
        vnsra.wi \o11, v22, 12
        vnsra.wi \o6, v24, 12
        vnsra.wi \o9, v26, 12
        vnsra.wi \o5, v28, 12
        vnsra.wi \o10, v30, 12

        vmv.v.x v16, zero
        vssub.vv \o1, v16, \o1
        vssub.vv \o3, v16, \o3
        vssub.vv \o5, v16, \o5
        vssub.vv \o7, v16, \o7
        vssub.vv \o9, v16, \o9
        vssub.vv \o11, v16, \o11
        vssub.vv \o13, v16, \o13
        vssub.vv \o15, v16, \o15
.endm

function inv_adst_e16_x16_rvv, export=1, ext=v
        iadst_16 v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15
        jr t0
endfunc

function inv_flipadst_e16_x16_rvv, export=1, ext=v
        iadst_16 v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0
        jr t0
endfunc

.macro def_horz_16 variant
function inv_txfm_horz\variant\()_16x8_rvv, export=1, ext=v
        vmv.v.x v16, zero
        vle16.v v0, (t4)
        vse16.v v16, (t4)
.irp i, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        add t4, t4, t6
        vle16.v v\i, (t4)
        vse16.v v16, (t4)
.endr
.ifc \variant, _identity
        li t1, 2*(5793-4096)*8
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        vsmul.vx v16, v\i, t1
        vsra.vi v16, v16, 1
        vaadd.vv v\i, v\i, v16
.endr
        j L(horz_16x8_epilog)
.else
        jalr t0, a4
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        vssra.vi v\i, v\i, 2
.endr
L(horz_16x8_epilog):
        vsse16.v v0, (t5), t6
.irp i, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        addi t5, t5, 2
        vsse16.v v\i, (t5), t6
.endr
        jr a7
.endif
endfunc
.endm

def_horz_16 _identity
def_horz_16

function inv_txfm_add_vert_8x16_rvv, export=1, ext=v
        vsetivli zero, 8, e16, m1, ta, ma

        vle16.v v0, (t4)
.irp i, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        add t4, t4, t6
        vle16.v v\i, (t4)
.endr

        jalr t0, a5

.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        vssra.vi v\i, v\i, 4
.endr

        vsetivli zero, 8, e8, mf2, ta, ma

        vle8.v v16, (t5)
        add t0, t5, a1
        vle8.v v17, (t0)
.irp i, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        add t0, t0, a1
        vle8.v v\i, (t0)
.endr

        vwaddu.wv v0, v0, v16
        vwaddu.wv v1, v1, v17
        vwaddu.wv v2, v2, v18
        vwaddu.wv v3, v3, v19
        vwaddu.wv v4, v4, v20
        vwaddu.wv v5, v5, v21
        vwaddu.wv v6, v6, v22
        vwaddu.wv v7, v7, v23
        vwaddu.wv v8, v8, v24
        vwaddu.wv v9, v9, v25
        vwaddu.wv v10, v10, v26
        vwaddu.wv v11, v11, v27
        vwaddu.wv v12, v12, v28
        vwaddu.wv v13, v13, v29
        vwaddu.wv v14, v14, v30
        vwaddu.wv v15, v15, v31

        vsetvli zero, zero, e16, m1, ta, ma
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        vmax.vx v\i, v\i, zero
.endr

        vsetvli zero, zero, e8, mf2, ta, ma
        vnclipu.wi v16, v0, 0
        vnclipu.wi v17, v1, 0
        vnclipu.wi v18, v2, 0
        vnclipu.wi v19, v3, 0
        vnclipu.wi v20, v4, 0
        vnclipu.wi v21, v5, 0
        vnclipu.wi v22, v6, 0
        vnclipu.wi v23, v7, 0
        vnclipu.wi v24, v8, 0
        vnclipu.wi v25, v9, 0
        vnclipu.wi v26, v10, 0
        vnclipu.wi v27, v11, 0
        vnclipu.wi v28, v12, 0
        vnclipu.wi v29, v13, 0
        vnclipu.wi v30, v14, 0
        vnclipu.wi v31, v15, 0

        vse8.v v16, (t5)
.irp i, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        add t5, t5, a1
        vse8.v v\i, (t5)
.endr

        jr a7
endfunc

function inv_txfm_add_16x16_rvv, export=1, ext=v
        csrw vxrm, zero
        vsetivli zero, 8, e16, m1, ta, ma
        addi sp, sp, -16*32
.irp i, 8, 0
        addi t4, a2, \i*2
        addi t5, sp, \i*16*2
.if \i == 8
        blt a3, a7, 1f
.endif
        li t6, 16*2
        jalr a7, a6
.if \i == 8
        j 2f
1:
        li t1, 64
        vsetvli zero, t1, e16, m8, ta, ma
        vmv.v.x v0, zero
        vse16.v v0, (t5)
        addi t5, t5, 128
        vse16.v v0, (t5)
        vsetivli zero, 8, e16, m1, ta, ma
2:
.endif
.endr
.irp i, 0, 8
        addi t4, sp, \i*2
        addi t5, a0, \i
        li t6, 16*2
        jal a7, inv_txfm_add_vert_8x16_rvv
.endr
        addi sp, sp, 16*32
        ret
endfunc

.macro def_fn_16x16 txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_rvv, export=1, ext=v
.ifc \txfm1, identity
        la a6, inv_txfm_horz_identity_16x8_rvv
.else
        la a6, inv_txfm_horz_16x8_rvv
        la a4, inv_\txfm1\()_e16_x16_rvv
.endif
        la a5, inv_\txfm2\()_e16_x16_rvv
        li a7, \eob_half
        j inv_txfm_add_16x16_rvv
endfunc
.endm

def_fn_16x16 dct, dct, 36
def_fn_16x16 identity, identity, 36
def_fn_16x16 dct, adst, 36
def_fn_16x16 dct, flipadst, 36
def_fn_16x16 dct, identity, 8
def_fn_16x16 adst, dct, 36
def_fn_16x16 adst, adst, 36
def_fn_16x16 adst, flipadst, 36
def_fn_16x16 flipadst, dct, 36
def_fn_16x16 flipadst, adst, 36
def_fn_16x16 flipadst, flipadst, 36
def_fn_16x16 identity, dct, 8

.macro def_fn_416_base variant
function inv_txfm_\variant\()add_4x16_rvv, export=1, ext=v
        csrw vxrm, zero

        vsetivli zero, 8, e16, m1, ta, ma

        blt a3, a6, 1f

        addi t0, a2, 16
        vle16.v v0, (t0)
        addi t0, t0, 32
        vle16.v v1, (t0)
        addi t0, t0, 32
        vle16.v v2, (t0)
        addi t0, t0, 32
        vle16.v v3, (t0)

.ifc \variant, identity_
        li t1, (5793-4096)*8
        vsmul.vx v8, v0, t1
        vaadd.vv v4, v0, v8
        vsmul.vx v8, v1, t1
        vaadd.vv v5, v1, v8
        vsmul.vx v8, v2, t1
        vaadd.vv v6, v2, v8
        vsmul.vx v8, v3, t1
        vaadd.vv v7, v3, v8
.else
        jalr t0, a4

        vssra.vi v4, v0, 1
        vssra.vi v5, v1, 1
        vssra.vi v6, v2, 1
        vssra.vi v7, v3, 1
.endif

        j 2f

1:
.irp i, 4, 5, 6, 7
        vmv.v.x v\i, zero
.endr

2:
        vle16.v v0, (a2)
        addi t0, a2, 32
        vle16.v v1, (t0)
        addi t0, t0, 32
        vle16.v v2, (t0)
        addi t0, t0, 32
        vle16.v v3, (t0)

.ifc \variant, identity_
        li t1, (5793-4096)*8
.irp i, 0, 1, 2, 3
        vsmul.vx v8, v\i, t1
        vaadd.vv v\i, v\i, v8
.endr

        j L(itx_4x16_epilog)
.else
        jalr t0, a4

        vssra.vi v0, v0, 1
        vssra.vi v1, v1, 1
        vssra.vi v2, v2, 1
        vssra.vi v3, v3, 1

L(itx_4x16_epilog):
        vsseg4e16.v v0, (a2)
        addi t0, a2, 64
        vsseg4e16.v v4, (t0)

        vsetivli zero, 4, e16, mf2, ta, ma

        vmv.v.x v16, zero
        vle16.v v0, (a2)
        vse16.v v16, (a2)
        addi t0, a2, 8
        vle16.v v1, (t0)
        vse16.v v16, (t0)
.irp i, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        addi t0, t0, 8
        vle16.v v\i, (t0)
        vse16.v v16, (t0)
.endr

        jalr t0, a5

.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        vssra.vi v\i, v\i, 4
.endr

        vsetvli zero, zero, e8, mf4, ta, ma

        vle8.v v16, (a0)
        add t0, a0, a1
        vle8.v v17, (t0)
.irp i, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        add t0, t0, a1
        vle8.v v\i, (t0)
.endr

        vwaddu.wv v0, v0, v16
        vwaddu.wv v1, v1, v17
        vwaddu.wv v2, v2, v18
        vwaddu.wv v3, v3, v19
        vwaddu.wv v4, v4, v20
        vwaddu.wv v5, v5, v21
        vwaddu.wv v6, v6, v22
        vwaddu.wv v7, v7, v23
        vwaddu.wv v8, v8, v24
        vwaddu.wv v9, v9, v25
        vwaddu.wv v10, v10, v26
        vwaddu.wv v11, v11, v27
        vwaddu.wv v12, v12, v28
        vwaddu.wv v13, v13, v29
        vwaddu.wv v14, v14, v30
        vwaddu.wv v15, v15, v31

        vsetvli zero, zero, e16, mf2, ta, ma
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        vmax.vx v\i, v\i, zero
.endr

        vsetvli zero, zero, e8, mf4, ta, ma

        vnclipu.wi v16, v0, 0
        vnclipu.wi v17, v1, 0
        vnclipu.wi v18, v2, 0
        vnclipu.wi v19, v3, 0
        vnclipu.wi v20, v4, 0
        vnclipu.wi v21, v5, 0
        vnclipu.wi v22, v6, 0
        vnclipu.wi v23, v7, 0
        vnclipu.wi v24, v8, 0
        vnclipu.wi v25, v9, 0
        vnclipu.wi v26, v10, 0
        vnclipu.wi v27, v11, 0
        vnclipu.wi v28, v12, 0
        vnclipu.wi v29, v13, 0
        vnclipu.wi v30, v14, 0
        vnclipu.wi v31, v15, 0

        vse8.v v16, (a0)
.irp i, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        add a0, a0, a1
        vse8.v v\i, (a0)
.endr

        ret
.endif
endfunc

function inv_txfm_\variant\()add_16x4_rvv, export=1, ext=v
        csrw vxrm, zero

        vsetivli zero, 4, e16, mf2, ta, ma
        vle16.v v0, (a2)
        addi t0, a2, 8
        vle16.v v1, (t0)
.irp i, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        addi t0, t0, 8
        vle16.v v\i, (t0)
.endr

.ifc \variant, identity_
        li t1, 2*(5793-4096)*8
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        vsmul.vx v16, v\i, t1
        vssra.vi v16, v16, 1
        vsadd.vv v\i, v\i, v16
.endr

        j L(itx_16x4_epilog)
.else
        jalr t0, a4

.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        vssra.vi v\i, v\i, 1
.endr

L(itx_16x4_epilog):
        li t0, 32
        vssseg8e16.v v0, (a2), t0
        addi t1, a2, 16
        vssseg8e16.v v8, (t1), t0

.irp j, 0, 8
        vsetivli zero, 8, e16, m1, ta, ma

        vmv.v.x v4, zero
        addi t0, a2, \j*2
        vle16.v v0, (t0)
        vse16.v v4, (t0)
.irp i, 1, 2, 3
        addi t0, t0, 32
        vle16.v v\i, (t0)
        vse16.v v4, (t0)
.endr

        jalr t0, a5

        vssra.vi v0, v0, 4
        vssra.vi v1, v1, 4
        vssra.vi v2, v2, 4
        vssra.vi v3, v3, 4

        vsetvli zero, zero, e8, mf2, ta, ma
        addi t0, a0, \j
        vle8.v v4, (t0)
        add t0, t0, a1
        vle8.v v5, (t0)
        add t0, t0, a1
        vle8.v v6, (t0)
        add t0, t0, a1
        vle8.v v7, (t0)

        vwaddu.wv v0, v0, v4
        vwaddu.wv v1, v1, v5
        vwaddu.wv v2, v2, v6
        vwaddu.wv v3, v3, v7

        vsetvli zero, zero, e16, m1, ta, ma
        vmax.vx v0, v0, zero
        vmax.vx v1, v1, zero
        vmax.vx v2, v2, zero
        vmax.vx v3, v3, zero

        vsetvli zero, zero, e8, mf2, ta, ma

        vnclipu.wi v4, v0, 0
        vnclipu.wi v5, v1, 0
        vnclipu.wi v6, v2, 0
        vnclipu.wi v7, v3, 0

        addi t0, a0, \j
        vse8.v v4, (t0)
        add t0, t0, a1
        vse8.v v5, (t0)
        add t0, t0, a1
        vse8.v v6, (t0)
        add t0, t0, a1
        vse8.v v7, (t0)
.endr

        ret
.endif
endfunc
.endm

def_fn_416_base identity_
def_fn_416_base

.macro def_fn_416 w, h, txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_rvv, export=1
.if \w == 4 && (\txfm1 == adst || \txfm1 == flipadst)
        la a4, inv_\txfm1\()_e16_x\w\()w_rvv
.elseif \txfm1 != identity
        la a4, inv_\txfm1\()_e16_x\w\()_rvv
.endif
.if \h == 4 && (\txfm2 == adst || \txfm2 == flipadst)
        la a5, inv_\txfm2\()_e16_x\h\()w_rvv
.else
        la a5, inv_\txfm2\()_e16_x\h\()_rvv
.endif
.if \w == 4
        li a6, \eob_half
.endif
.ifc \txfm1, identity
        j inv_txfm_identity_add_\w\()x\h\()_rvv
.else
        j inv_txfm_add_\w\()x\h\()_rvv
.endif
endfunc
.endm

.macro def_fns_416 w, h
def_fn_416 \w, \h, dct, dct, 29
def_fn_416 \w, \h, identity, identity, 29
def_fn_416 \w, \h, dct, adst, 29
def_fn_416 \w, \h, dct, flipadst, 29
def_fn_416 \w, \h, dct, identity, 8
def_fn_416 \w, \h, adst, dct, 29
def_fn_416 \w, \h, adst, adst, 29
def_fn_416 \w, \h, adst, flipadst, 29
def_fn_416 \w, \h, flipadst, dct, 29
def_fn_416 \w, \h, flipadst, adst, 29
def_fn_416 \w, \h, flipadst, flipadst, 29
def_fn_416 \w, \h, identity, dct, 32
def_fn_416 \w, \h, adst, identity, 8
def_fn_416 \w, \h, flipadst, identity, 8
def_fn_416 \w, \h, identity, adst, 32
def_fn_416 \w, \h, identity, flipadst, 32
.endm

def_fns_416 4, 16
def_fns_416 16, 4

.macro def_fn_816_base variant
function inv_txfm_\variant\()add_8x16_rvv, export=1, ext=v
        csrw vxrm, zero

        vsetivli zero, 8, e16, m1, ta, ma

        blt a3, a6, 1f

        vmv.v.x v16, zero
        addi t0, a2, 16
        vle16.v v0, (t0)
        vse16.v v16, (t0)
.irp i, 1, 2, 3, 4, 5, 6, 7
        addi t0, t0, 32
        vle16.v v\i, (t0)
        vse16.v v16, (t0)
.endr

        li t1, 2896*8
.ifc \variant, identity_
        vsmul.vx v8, v0, t1
        vsmul.vx v9, v1, t1
        vsmul.vx v10, v2, t1
        vsmul.vx v11, v3, t1
        vsmul.vx v12, v4, t1
        vsmul.vx v13, v5, t1
        vsmul.vx v14, v6, t1
        vsmul.vx v15, v7, t1
.else
.irp i, 0, 1, 2, 3, 4, 5, 6, 7
        vsmul.vx v\i, v\i, t1
.endr

        jalr t0, a4

        vssra.vi v8, v0, 1
        vssra.vi v9, v1, 1
        vssra.vi v10, v2, 1
        vssra.vi v11, v3, 1
        vssra.vi v12, v4, 1
        vssra.vi v13, v5, 1
        vssra.vi v14, v6, 1
        vssra.vi v15, v7, 1
.endif

        j 2f

1:
.irp i, 8, 9, 10, 11, 12, 13, 14, 15
        vmv.v.x v\i, zero
.endr

2:
        vmv.v.x v16, zero
        vle16.v v0, (a2)
        vse16.v v16, (a2)
        addi t0, a2, 32
        vle16.v v1, (t0)
        vse16.v v16, (t0)
.irp i, 2, 3, 4, 5, 6, 7
        addi t0, t0, 32
        vle16.v v\i, (t0)
        vse16.v v16, (t0)
.endr

        li t1, 2896*8
.irp i, 0, 1, 2, 3, 4, 5, 6, 7
        vsmul.vx v\i, v\i, t1
.endr

.ifc \variant, identity_
        j L(itx_8x16_epilog)
.else
        jalr t0, a4

.irp i, 0, 1, 2, 3, 4, 5, 6, 7
        vssra.vi v\i, v\i, 1
.endr

L(itx_8x16_epilog):
        addi t4, sp, -8*32
        vsseg8e16.v v0, (t4)
        addi t0, t4, 8*16
        vsseg8e16.v v8, (t0)

        mv t5, a0
        li t6, 16
        jal a7, inv_txfm_add_vert_8x16_rvv

        ret
.endif
endfunc

function inv_txfm_\variant\()add_16x8_rvv, export=1, ext=v
        csrw vxrm, zero

        vsetivli zero, 8, e16, m1, ta, ma
        vle16.v v0, (a2)
        addi t0, a2, 16
        vle16.v v1, (t0)
.irp i, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        addi t0, t0, 16
        vle16.v v\i, (t0)
.endr

        li t1, 2896*8
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        vsmul.vx v\i, v\i, t1
.endr

.ifc \variant, identity_
        li t1, 2*(5793-4096)*8
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        vsmul.vx v16, v\i, t1
        vssra.vi v16, v16, 1
        vsadd.vv v\i, v\i, v16
.endr

        j L(itx_16x8_epilog)
.else
        jalr t0, a4

.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        vssra.vi v\i, v\i, 1
.endr

L(itx_16x8_epilog):
        li t0, 32
        vssseg8e16.v v0, (a2), t0
        addi t1, a2, 16
        vssseg8e16.v v8, (t1), t0

.irp j, 0, 8
        vsetivli zero, 8, e16, m1, ta, ma

        vmv.v.x v8, zero
        addi t0, a2, \j*2
        vle16.v v0, (t0)
        vse16.v v8, (t0)
.irp i, 1, 2, 3, 4, 5, 6, 7
        addi t0, t0, 32
        vle16.v v\i, (t0)
        vse16.v v8, (t0)
.endr

        jalr t0, a5

.irp i, 0, 1, 2, 3, 4, 5, 6, 7
        vssra.vi v\i, v\i, 4
.endr

        vsetvli zero, zero, e8, mf2, ta, ma
        addi t0, a0, \j
        vle8.v v8, (t0)
.irp i, 9, 10, 11, 12, 13, 14, 15
        add t0, t0, a1
        vle8.v v\i, (t0)
.endr

        vwaddu.wv v0, v0, v8
        vwaddu.wv v1, v1, v9
        vwaddu.wv v2, v2, v10
        vwaddu.wv v3, v3, v11
        vwaddu.wv v4, v4, v12
        vwaddu.wv v5, v5, v13
        vwaddu.wv v6, v6, v14
        vwaddu.wv v7, v7, v15

        vsetvli zero, zero, e16, m1, ta, ma
.irp i, 0, 1, 2, 3, 4, 5, 6, 7
        vmax.vx v\i, v\i, zero
.endr

        vsetvli zero, zero, e8, mf2, ta, ma

        vnclipu.wi v8, v0, 0
        vnclipu.wi v9, v1, 0
        vnclipu.wi v10, v2, 0
        vnclipu.wi v11, v3, 0
        vnclipu.wi v12, v4, 0
        vnclipu.wi v13, v5, 0
        vnclipu.wi v14, v6, 0
        vnclipu.wi v15, v7, 0

        addi t0, a0, \j
        vse8.v v8, (t0)
.irp i, 9, 10, 11, 12, 13, 14, 15
        add t0, t0, a1
        vse8.v v\i, (t0)
.endr
.endr

        ret
.endif
endfunc
.endm

def_fn_816_base identity_
def_fn_816_base

.macro def_fn_816 w, h, txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_rvv, export=1
.ifnc \txfm1, identity
        la a4, inv_\txfm1\()_e16_x\w\()_rvv
.endif
        la a5, inv_\txfm2\()_e16_x\h\()_rvv
.if \w == 8
        li a6, \eob_half
.endif
.ifc \txfm1, identity
        j inv_txfm_identity_add_\w\()x\h\()_rvv
.else
        j inv_txfm_add_\w\()x\h\()_rvv
.endif
endfunc
.endm

.macro def_fns_816 w, h
def_fn_816 \w, \h, dct, dct, 43
def_fn_816 \w, \h, identity, identity, 43
def_fn_816 \w, \h, dct, adst, 43
def_fn_816 \w, \h, dct, flipadst, 43
def_fn_816 \w, \h, dct, identity, 8
def_fn_816 \w, \h, adst, dct, 43
def_fn_816 \w, \h, adst, adst, 43
def_fn_816 \w, \h, adst, flipadst, 43
def_fn_816 \w, \h, flipadst, dct, 43
def_fn_816 \w, \h, flipadst, adst, 43
def_fn_816 \w, \h, flipadst, flipadst, 43
def_fn_816 \w, \h, identity, dct, 64
def_fn_816 \w, \h, adst, identity, 8
def_fn_816 \w, \h, flipadst, identity, 8
def_fn_816 \w, \h, identity, adst, 64
def_fn_816 \w, \h, identity, flipadst, 64
.endm

def_fns_816 8, 16
def_fns_816 16, 8