/*
 * Copyright © 2023, VideoLAN and dav1d authors
 * Copyright © 2023, Loongson Technology Corporation Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/loongarch/loongson_asm.S"

/*
void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrdiff_t stride,
                                coef *const coeff, const int eob
                                HIGHBD_DECL_SUFFIX)
*/
function inv_txfm_add_wht_wht_4x4_8bpc_lsx
    vld vr0, a2, 0
    vld vr2, a2, 16

    vreplgr2vr.h vr20, zero

    vsrai.h vr0, vr0, 2
    vsrai.h vr2, vr2, 2

    vst vr20, a2, 0

    vpickod.d vr1, vr0, vr0
    vpickod.d vr3, vr2, vr2

    vadd.h vr4, vr0, vr1
    vsub.h vr5, vr2, vr3
    vsub.h vr6, vr4, vr5
    vsrai.h vr6, vr6, 1
    vsub.h vr0, vr6, vr3
    vsub.h vr2, vr6, vr1
    vsub.h vr1, vr4, vr0
    vadd.h vr3, vr5, vr2

    vst vr20, a2, 16

    vilvl.h vr4, vr0, vr1
    vilvl.h vr5, vr3, vr2
    vilvl.w vr0, vr5, vr4
    vilvh.w vr2, vr5, vr4
    vilvh.d vr1, vr0, vr0
    vilvh.d vr3, vr2, vr2

    vadd.h vr4, vr0, vr1
    vsub.h vr5, vr2, vr3
    vsub.h vr6, vr4, vr5
    vsrai.h vr6, vr6, 1
    vsub.h vr0, vr6, vr3
    vsub.h vr2, vr6, vr1
    vsub.h vr1, vr4, vr0
    vadd.h vr3, vr5, vr2

    vld vr4, a0, 0
    vldx vr5, a0, a1
    alsl.d t0, a1, a0, 1
    vld vr6, t0, 0
    vldx vr7, t0, a1

    vsllwil.hu.bu vr4, vr4, 0
    vsllwil.hu.bu vr5, vr5, 0
    vsllwil.hu.bu vr6, vr6, 0
    vsllwil.hu.bu vr7, vr7, 0
    vilvl.d vr1, vr0, vr1
    vilvl.d vr2, vr3, vr2
    vilvl.d vr4, vr5, vr4
    vilvl.d vr6, vr7, vr6
    vadd.h vr1, vr1, vr4
    vadd.h vr2, vr2, vr6
    vssrani.bu.h vr2, vr1, 0

    vstelm.w vr2, a0, 0, 0
    add.d a0, a0, a1
    vstelm.w vr2, a0, 0, 1
    add.d a0, a0, a1
    vstelm.w vr2, a0, 0, 2
    add.d a0, a0, a1
    vstelm.w vr2, a0, 0, 3
endfunc

const idct_coeffs, align=4
    // idct4
    .word 2896, 2896*8, 1567, 3784
    // idct8
    .word 799, 4017, 3406, 2276
    // idct16
    .word 401, 4076, 3166, 2598
    .word 1931, 3612, 3920, 1189
    // idct32
    .word 201, 4091, 3035, 2751
    .word 1751, 3703, 3857, 1380
    .word 995, 3973, 3513, 2106
    .word 2440, 3290, 4052, 601
endconst

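// Load/store a block of 8 or 16 vectors to/from consecutive rows of a
// coefficient buffer: row k lives at byte offset \start + k*\stride
// relative to \src.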
.macro vld_x8 src, start, stride, in0, in1, in2, in3, in4, in5, in6, in7
    vld \in0, \src, \start
    vld \in1, \src, \start+(\stride*1)
    vld \in2, \src, \start+(\stride*2)
    vld \in3, \src, \start+(\stride*3)
    vld \in4, \src, \start+(\stride*4)
    vld \in5, \src, \start+(\stride*5)
    vld \in6, \src, \start+(\stride*6)
    vld \in7, \src, \start+(\stride*7)
.endm

.macro vst_x8 src, start, stride, in0, in1, in2, in3, in4, in5, in6, in7
    vst \in0, \src, \start
    vst \in1, \src, \start+(\stride*1)
    vst \in2, \src, \start+(\stride*2)
    vst \in3, \src, \start+(\stride*3)
    vst \in4, \src, \start+(\stride*4)
    vst \in5, \src, \start+(\stride*5)
    vst \in6, \src, \start+(\stride*6)
    vst \in7, \src, \start+(\stride*7)
.endm

.macro vld_x16 src, start, stride, in0, in1, in2, in3, in4, in5, in6, in7, \
               in8, in9, in10, in11, in12, in13, in14, in15

    vld_x8 \src, \start, \stride, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7

    vld \in8, \src, \start+(\stride*8)
    vld \in9, \src, \start+(\stride*9)
    vld \in10, \src, \start+(\stride*10)
    vld \in11, \src, \start+(\stride*11)
    vld \in12, \src, \start+(\stride*12)
    vld \in13, \src, \start+(\stride*13)
    vld \in14, \src, \start+(\stride*14)
    vld \in15, \src, \start+(\stride*15)
.endm

.macro vst_x16 src, start, stride, in0, in1, in2, in3, in4, in5, in6, in7, \
               in8, in9, in10, in11, in12, in13, in14, in15

    vst_x8 \src, \start, \stride, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7

    vst \in8, \src, \start+(\stride*8)
    vst \in9, \src, \start+(\stride*9)
    vst \in10, \src, \start+(\stride*10)
    vst \in11, \src, \start+(\stride*11)
    vst \in12, \src, \start+(\stride*12)
    vst \in13, \src, \start+(\stride*13)
    vst \in14, \src, \start+(\stride*14)
    vst \in15, \src, \start+(\stride*15)
.endm

.macro DST_ADD_W4 in0, in1, in2, in3, in4, in5
    vilvl.w vr10, \in1, \in0 // 0 1 2 3 4 5 6 7 x ...
    vilvl.w vr12, \in3, \in2 // 8 9 10 11 12 13 14 15 x ...
    vsllwil.hu.bu vr10, vr10, 0
    vsllwil.hu.bu vr12, vr12, 0
    vadd.h vr10, \in4, vr10
    vadd.h vr12, \in5, vr12
    vssrani.bu.h vr12, vr10, 0
    vstelm.w vr12, a0, 0, 0
    add.d t8, a0, a1
    vstelm.w vr12, t8, 0, 1
    vstelm.w vr12, t2, 0, 2
    add.d t8, t2, a1
    vstelm.w vr12, t8, 0, 3
.endm

.macro VLD_DST_ADD_W4 in0, in1
    vld vr0, a0, 0
    vldx vr1, a0, a1
    vld vr2, t2, 0
    vldx vr3, t2, a1

    DST_ADD_W4 vr0, vr1, vr2, vr3, \in0, \in1
.endm

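// One 4-point inverse DCT pass over two interleaved rows. The odd inputs
// (high halves of \in0/\in1) are rotated by (\in4, \in5) = (3784, 1567),
// roughly 4096*cos(pi/8) and 4096*sin(pi/8), giving t3/t2; the even inputs
// (low halves of \in2/\in3) are combined with \in6 = \in7 = 2896, roughly
// 4096/sqrt(2), giving t0/t1. Both pairs are rounded by 12 bits, then a
// saturating butterfly forms out0 = t0+t3 | t1+t2 and out1 = t0-t3 | t1-t2.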
.macro dct_4x4_core_lsx in0, in1, in2, in3, in4, in5, in6, in7, out0, out1
    vexth.w.h vr4, \in0 // in1
    vexth.w.h vr5, \in1 // in3
    vmul.w vr6, vr4, \in4
    vmul.w vr7, vr4, \in5
    vmadd.w vr6, vr5, \in5 // t3
    vmsub.w vr7, vr5, \in4 // t2
    vsllwil.w.h vr4, \in2, 0 // in0
    vsllwil.w.h vr5, \in3, 0 // in2
    vmul.w vr9, vr4, \in6
    vmul.w vr10, vr4, \in7
    vmadd.w vr9, vr5, \in7 // t0
    vmsub.w vr10, vr5, \in6 // t1
    vssrarni.h.w vr10, vr9, 12 // t0 t1
    vssrarni.h.w vr7, vr6, 12 // t3 t2
    vsadd.h \out0, vr10, vr7 // 0 4 8 12 1 5 9 13 c[0] c[1]
    vssub.h \out1, vr10, vr7 // 3 7 11 15 2 6 10 14 c[3] c[2]
.endm

.macro inv_dct_dct_4x4_lsx
    la.local t0, idct_coeffs

    vld vr0, a2, 0 // 0 1 2 3 4 5 6 7
    vld vr1, a2, 16 // 8 9 10 11 12 13 14 15

    vldrepl.w vr2, t0, 8 // 1567
    vldrepl.w vr3, t0, 12 // 3784
    vldrepl.w vr8, t0, 0 // 2896

    dct_4x4_core_lsx vr0, vr1, vr0, vr1, vr3, vr2, vr8, vr8, vr11, vr12

    vreplgr2vr.h vr15, zero
    vshuf4i.d vr12, vr12, 0x01 // 2 6 10 14 3 7 11 15
    vst vr15, a2, 0
    vst vr15, a2, 16

    vilvl.h vr4, vr12, vr11 // 0 2 4 6 8 10 12 14
    vilvh.h vr5, vr12, vr11 // 1 3 5 7 9 11 13 15
    vilvl.h vr0, vr5, vr4 // 0 1 2 3 4 5 6 7
    vilvh.h vr1, vr5, vr4 // 8 9 10 11 12 13 14 15

    dct_4x4_core_lsx vr0, vr1, vr0, vr1, vr3, vr2, vr8, vr8, vr13, vr14
    vsrari.h vr13, vr13, 4
    vsrari.h vr14, vr14, 4
    vshuf4i.d vr14, vr14, 0x01

    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W4 vr13, vr14
.endm

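// 4-point identity "transform": a plain scale by sqrt(2). Since 1697/4096
// is approximately sqrt(2)-1, out0 = \in3 + round2(in*1697, 12) ~= in*sqrt(2)
// (callers pass the same vector for \in0, \in1 and \in3).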
.macro identity_4x4_lsx in0, in1, in2, in3, out0
    vsllwil.w.h vr2, \in0, 0
    vexth.w.h vr3, \in1
    vmul.w vr4, vr2, \in2
    vmul.w vr5, vr3, \in2
    vssrarni.h.w vr5, vr4, 12
    vsadd.h \out0, vr5, \in3
.endm

.macro inv_identity_identity_4x4_lsx
    vld vr0, a2, 0 // 0 1 2 3 4 5 6 7
    vld vr1, a2, 16 // 8 9 10 11 12 13 14 15

    li.w t0, 1697
    vreplgr2vr.w vr20, t0

    identity_4x4_lsx vr0, vr0, vr20, vr0, vr0
    identity_4x4_lsx vr1, vr1, vr20, vr1, vr1
    vreplgr2vr.h vr15, zero
    vst vr15, a2, 0
    vst vr15, a2, 16
    identity_4x4_lsx vr0, vr0, vr20, vr0, vr6
    identity_4x4_lsx vr1, vr1, vr20, vr1, vr7

    vsrari.h vr6, vr6, 4
    vsrari.h vr7, vr7, 4
    vilvh.d vr8, vr6, vr6
    vilvh.d vr9, vr7, vr7
    vilvl.h vr4, vr8, vr6
    vilvl.h vr5, vr9, vr7
    vilvl.w vr6, vr5, vr4
    vilvh.w vr7, vr5, vr4

    alsl.d t2, a1, a0, 1
    VLD_DST_ADD_W4 vr6, vr7
.endm

const iadst4_coeffs, align=4
    .word 1321, 3803, 2482, 3344
endconst

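// 4-point inverse ADST. With (vr20, vr21, vr22, vr23) = (1321, 3803, 2482,
// 3344), the AV1 sinpi constants, this produces (before the caller's 12-bit
// rounding):
//   out0 = 1321*in0 + 3803*in2 + 2482*in3 + 3344*in1
//   out1 = 2482*in0 - 1321*in2 - 3803*in3 + 3344*in1
//   out2 = 3344*(in0 - in2 + in3)
//   out3 = 3803*in0 + 2482*in2 - 1321*in3 - 3344*in1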
.macro adst4x4_1d_lsx in0, in1, in2, in3, out0, out1, out2, out3
    vsub.w vr6, \in0, \in2 // in0-in2
    vmul.w vr7, \in0, vr20 // in0*1321
    vmadd.w vr7, \in2, vr21 // in0*1321+in2*3803
    vmadd.w vr7, \in3, vr22 // in0*1321+in2*3803+in3*2482
    vmul.w vr8, \in1, vr23 // in1*3344
    vadd.w vr6, vr6, \in3 // in0-in2+in3
    vmul.w vr9, \in0, vr22 // in0*2482
    vmsub.w vr9, \in2, vr20 // in2*1321
    vmsub.w vr9, \in3, vr21 // in0*2482-in2*1321-in3*3803
    vadd.w vr5, vr7, vr9
    vmul.w \out2, vr6, vr23 // out[2] 8 9 10 11
    vadd.w \out0, vr7, vr8 // out[0] 0 1 2 3
    vadd.w \out1, vr9, vr8 // out[1] 4 5 6 7
    vsub.w \out3, vr5, vr8 // out[3] 12 13 14 15
.endm

.macro inv_adst_dct_4x4_lsx
    vld vr0, a2, 0
    vld vr1, a2, 16

    la.local t0, iadst4_coeffs
    vsllwil.w.h vr2, vr0, 0 // in0
    vexth.w.h vr3, vr0 // in1
    vsllwil.w.h vr4, vr1, 0 // in2
    vexth.w.h vr5, vr1 // in3
    vldrepl.w vr20, t0, 0 // 1321
    vldrepl.w vr21, t0, 4 // 3803
    vldrepl.w vr22, t0, 8 // 2482
    vldrepl.w vr23, t0, 12 // 3344

    adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3

    LSX_TRANSPOSE4x4_W vr0, vr1, vr2, vr3, vr11, vr13, vr12, vr14, vr6, vr7
    vssrarni.h.w vr13, vr11, 12
    vssrarni.h.w vr14, vr12, 12

    vreplgr2vr.h vr15, zero
    la.local t0, idct_coeffs
    vst vr15, a2, 0
    vst vr15, a2, 16
    vldrepl.w vr20, t0, 8 // 1567
    vldrepl.w vr21, t0, 12 // 3784
    vldrepl.w vr22, t0, 0 // 2896

    dct_4x4_core_lsx vr13, vr14, vr13, vr14, vr21, vr20, vr22, vr22, vr13, vr14

    vshuf4i.d vr14, vr14, 0x01
    vsrari.h vr13, vr13, 4
    vsrari.h vr14, vr14, 4

    alsl.d t2, a1, a0, 1
    VLD_DST_ADD_W4 vr13, vr14
.endm

.macro inv_adst_adst_4x4_lsx
    vld vr0, a2, 0
    vld vr1, a2, 16

    la.local t0, iadst4_coeffs
    vsllwil.w.h vr2, vr0, 0 // in0
    vexth.w.h vr3, vr0 // in1
    vsllwil.w.h vr4, vr1, 0 // in2
    vexth.w.h vr5, vr1 // in3
    vldrepl.w vr20, t0, 0 // 1321
    vldrepl.w vr21, t0, 4 // 3803
    vldrepl.w vr22, t0, 8 // 2482
    vldrepl.w vr23, t0, 12 // 3344

    adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3

    LSX_TRANSPOSE4x4_W vr0, vr1, vr2, vr3, vr11, vr13, vr12, vr14, vr6, vr7

    vsrari.w vr11, vr11, 12
    vsrari.w vr13, vr13, 12
    vsrari.w vr12, vr12, 12
    vsrari.w vr14, vr14, 12

    vreplgr2vr.h vr15, zero
    vst vr15, a2, 0
    vst vr15, a2, 16

    adst4x4_1d_lsx vr11, vr13, vr12, vr14, vr11, vr13, vr12, vr14

    vssrarni.h.w vr13, vr11, 12
    vssrarni.h.w vr14, vr12, 12
    vsrari.h vr13, vr13, 4
    vsrari.h vr14, vr14, 4

    alsl.d t2, a1, a0, 1
    VLD_DST_ADD_W4 vr13, vr14
.endm

.macro inv_dct_adst_4x4_lsx
    la.local t0, idct_coeffs

    vld vr0, a2, 0 // 0 1 2 3 4 5 6 7
    vld vr1, a2, 16 // 8 9 10 11 12 13 14 15

    vldrepl.w vr20, t0, 8 // 1567
    vldrepl.w vr21, t0, 12 // 3784
    vldrepl.w vr22, t0, 0 // 2896

    dct_4x4_core_lsx vr0, vr1, vr0, vr1, vr21, vr20, vr22, vr22, vr11, vr12

    vreplgr2vr.h vr15, zero
    vst vr15, a2, 0
    vst vr15, a2, 16

    vshuf4i.d vr12, vr12, 0x01 // 3 7 11 15 2 6 10 14

    vilvl.h vr4, vr12, vr11 // 0 2 4 6 8 10 12 14
    vilvh.h vr5, vr12, vr11 // 1 3 5 7 9 11 13 15
    vilvl.h vr11, vr5, vr4 // 0 1 2 3 4 5 6 7
    vilvh.h vr12, vr5, vr4 // 8 9 10 11 12 13 14 15

    vsllwil.w.h vr2, vr11, 0 // in0
    vexth.w.h vr3, vr11 // in1
    vsllwil.w.h vr4, vr12, 0 // in2
    vexth.w.h vr5, vr12 // in3

    la.local t0, iadst4_coeffs

    vldrepl.w vr20, t0, 0 // 1321
    vldrepl.w vr21, t0, 4 // 3803
    vldrepl.w vr22, t0, 8 // 2482
    vldrepl.w vr23, t0, 12 // 3344

    adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr11, vr13, vr12, vr14

    vssrarni.h.w vr13, vr11, 12
    vssrarni.h.w vr14, vr12, 12
    vsrari.h vr13, vr13, 4
    vsrari.h vr14, vr14, 4

    alsl.d t2, a1, a0, 1
    VLD_DST_ADD_W4 vr13, vr14
.endm

.macro inv_dct_flipadst_4x4_lsx
    la.local t0, idct_coeffs

    vld vr0, a2, 0 // 0 1 2 3 4 5 6 7
    vld vr1, a2, 16 // 8 9 10 11 12 13 14 15

    vldrepl.w vr20, t0, 8 // 1567
    vldrepl.w vr21, t0, 12 // 3784
    vldrepl.w vr22, t0, 0 // 2896

    dct_4x4_core_lsx vr0, vr1, vr0, vr1, vr21, vr20, vr22, vr22, vr11, vr12

    vreplgr2vr.h vr15, zero
    vst vr15, a2, 0
    vst vr15, a2, 16

    vshuf4i.d vr12, vr12, 0x01 // 3 7 11 15 2 6 10 14

    vilvl.h vr4, vr12, vr11 // 0 2 4 6 8 10 12 14
    vilvh.h vr5, vr12, vr11 // 1 3 5 7 9 11 13 15
    vilvl.h vr11, vr5, vr4 // 0 1 2 3 4 5 6 7
    vilvh.h vr12, vr5, vr4 // 8 9 10 11 12 13 14 15
    vsllwil.w.h vr2, vr11, 0 // in0
    vexth.w.h vr3, vr11 // in1
    vsllwil.w.h vr4, vr12, 0 // in2
    vexth.w.h vr5, vr12 // in3

    la.local t0, iadst4_coeffs

    vldrepl.w vr20, t0, 0 // 1321
    vldrepl.w vr21, t0, 4 // 3803
    vldrepl.w vr22, t0, 8 // 2482
    vldrepl.w vr23, t0, 12 // 3344

    adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr11, vr12, vr13, vr14

    vssrarni.h.w vr11, vr12, 12 // 0 1 2 3 4 5 6 7
    vssrarni.h.w vr13, vr14, 12 // 8 9 10 11 12 13 14 15
    vsrari.h vr11, vr11, 4
    vsrari.h vr13, vr13, 4

    alsl.d t2, a1, a0, 1
    VLD_DST_ADD_W4 vr13, vr11
.endm

.macro inv_flipadst_adst_4x4_lsx
    vld vr0, a2, 0
    vld vr1, a2, 16

    la.local t0, iadst4_coeffs
    vsllwil.w.h vr2, vr0, 0 // in0
    vexth.w.h vr3, vr0 // in1
    vsllwil.w.h vr4, vr1, 0 // in2
    vexth.w.h vr5, vr1 // in3
    vldrepl.w vr20, t0, 0 // 1321
    vldrepl.w vr21, t0, 4 // 3803
    vldrepl.w vr22, t0, 8 // 2482
    vldrepl.w vr23, t0, 12 // 3344

    adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3

    vsrari.w vr0, vr0, 12
    vsrari.w vr1, vr1, 12
    vsrari.w vr2, vr2, 12
    vsrari.w vr3, vr3, 12

    vilvl.w vr4, vr0, vr1
    vilvh.w vr5, vr0, vr1
    vilvl.w vr6, vr2, vr3
    vilvh.w vr7, vr2, vr3
    vilvl.d vr11, vr4, vr6
    vilvh.d vr12, vr4, vr6
    vilvl.d vr13, vr5, vr7
    vilvh.d vr14, vr5, vr7

    vreplgr2vr.h vr15, zero
    vst vr15, a2, 0
    vst vr15, a2, 16

    adst4x4_1d_lsx vr11, vr12, vr13, vr14, vr11, vr13, vr12, vr14

    vssrarni.h.w vr13, vr11, 12
    vssrarni.h.w vr14, vr12, 12
    vsrari.h vr13, vr13, 4
    vsrari.h vr14, vr14, 4

    alsl.d t2, a1, a0, 1
    VLD_DST_ADD_W4 vr13, vr14
.endm

.macro inv_adst_flipadst_4x4_lsx
    vld vr0, a2, 0
    vld vr1, a2, 16

    la.local t0, iadst4_coeffs
    vsllwil.w.h vr2, vr0, 0 // in0
    vexth.w.h vr3, vr0 // in1
    vsllwil.w.h vr4, vr1, 0 // in2
    vexth.w.h vr5, vr1 // in3
    vldrepl.w vr20, t0, 0 // 1321
    vldrepl.w vr21, t0, 4 // 3803
    vldrepl.w vr22, t0, 8 // 2482
    vldrepl.w vr23, t0, 12 // 3344

    adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3
    LSX_TRANSPOSE4x4_W vr0, vr1, vr2, vr3, vr11, vr13, vr12, vr14, vr6, vr7
    vsrari.w vr11, vr11, 12
    vsrari.w vr12, vr12, 12
    vsrari.w vr13, vr13, 12
    vsrari.w vr14, vr14, 12

    vreplgr2vr.h vr15, zero
    vst vr15, a2, 0
    vst vr15, a2, 16

    adst4x4_1d_lsx vr11, vr13, vr12, vr14, vr11, vr12, vr13, vr14

    vssrarni.h.w vr11, vr12, 12
    vssrarni.h.w vr13, vr14, 12
    vsrari.h vr11, vr11, 4
    vsrari.h vr13, vr13, 4

    alsl.d t2, a1, a0, 1
    VLD_DST_ADD_W4 vr13, vr11
.endm

.macro inv_flipadst_dct_4x4_lsx
    vld vr0, a2, 0
    vld vr1, a2, 16

    la.local t0, iadst4_coeffs
    vsllwil.w.h vr2, vr0, 0 // in0
    vexth.w.h vr3, vr0 // in1
    vsllwil.w.h vr4, vr1, 0 // in2
    vexth.w.h vr5, vr1 // in3
    vldrepl.w vr20, t0, 0 // 1321
    vldrepl.w vr21, t0, 4 // 3803
    vldrepl.w vr22, t0, 8 // 2482
    vldrepl.w vr23, t0, 12 // 3344

    adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3

    vilvl.w vr4, vr0, vr1
    vilvh.w vr5, vr0, vr1
    vilvl.w vr6, vr2, vr3
    vilvh.w vr7, vr2, vr3

    vilvl.d vr11, vr4, vr6
    vilvh.d vr12, vr4, vr6
    vilvl.d vr13, vr5, vr7
    vilvh.d vr14, vr5, vr7

    vssrarni.h.w vr12, vr11, 12
    vssrarni.h.w vr14, vr13, 12

    vreplgr2vr.h vr15, zero
    la.local t0, idct_coeffs
    vst vr15, a2, 0
    vst vr15, a2, 16
    vldrepl.w vr20, t0, 8 // 1567
    vldrepl.w vr21, t0, 12 // 3784
    vldrepl.w vr22, t0, 0 // 2896

    dct_4x4_core_lsx vr12, vr14, vr12, vr14, vr21, vr20, vr22, vr22, vr13, vr14

    vshuf4i.d vr14, vr14, 0x01
    vsrari.h vr13, vr13, 4
    vsrari.h vr14, vr14, 4

    alsl.d t2, a1, a0, 1
    VLD_DST_ADD_W4 vr13, vr14
.endm

.macro inv_flipadst_flipadst_4x4_lsx
    vld vr0, a2, 0
    vld vr1, a2, 16

    la.local t0, iadst4_coeffs
    vsllwil.w.h vr2, vr0, 0 // in0
    vexth.w.h vr3, vr0 // in1
    vsllwil.w.h vr4, vr1, 0 // in2
    vexth.w.h vr5, vr1 // in3
    vldrepl.w vr20, t0, 0 // 1321
    vldrepl.w vr21, t0, 4 // 3803
    vldrepl.w vr22, t0, 8 // 2482
    vldrepl.w vr23, t0, 12 // 3344

    adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3

    vilvl.w vr4, vr0, vr1
    vilvh.w vr5, vr0, vr1
    vilvl.w vr6, vr2, vr3
    vilvh.w vr7, vr2, vr3
    vilvl.d vr11, vr4, vr6
    vilvh.d vr12, vr4, vr6
    vilvl.d vr13, vr5, vr7
    vilvh.d vr14, vr5, vr7

    vsrari.w vr11, vr11, 12
    vsrari.w vr12, vr12, 12
    vsrari.w vr13, vr13, 12
    vsrari.w vr14, vr14, 12

    vreplgr2vr.h vr15, zero
    vst vr15, a2, 0
    vst vr15, a2, 16

    adst4x4_1d_lsx vr11, vr12, vr13, vr14, vr11, vr12, vr13, vr14

    vssrarni.h.w vr11, vr12, 12
    vssrarni.h.w vr13, vr14, 12
    vsrari.h vr11, vr11, 4
    vsrari.h vr13, vr13, 4

    alsl.d t2, a1, a0, 1
    VLD_DST_ADD_W4 vr13, vr11
.endm

.macro inv_dct_identity_4x4_lsx
    la.local t0, idct_coeffs

    vld vr0, a2, 0
    vld vr1, a2, 16

    vldrepl.w vr2, t0, 8 // 1567
    vldrepl.w vr3, t0, 12 // 3784
    vldrepl.w vr8, t0, 0 // 2896

    dct_4x4_core_lsx vr0, vr1, vr0, vr1, vr3, vr2, vr8, vr8, vr11, vr12
    vshuf4i.d vr12, vr12, 0x01 // 2 6 10 14 3 7 11 15

    vreplgr2vr.h vr15, zero
    li.w t0, 1697

    vilvl.h vr4, vr12, vr11 // 0 2 4 6 8 10 12 14
    vilvh.h vr5, vr12, vr11 // 1 3 5 7 9 11 13 15
    vilvl.h vr10, vr5, vr4 // 0 1 2 3 4 5 6 7
    vilvh.h vr12, vr5, vr4 // 8 9 10 11 12 13 14 15

    vst vr15, a2, 0
    vst vr15, a2, 16
    vreplgr2vr.w vr20, t0

    identity_4x4_lsx vr10, vr10, vr20, vr10, vr6
    identity_4x4_lsx vr12, vr12, vr20, vr12, vr7
    vsrari.h vr11, vr6, 4
    vsrari.h vr13, vr7, 4

    alsl.d t2, a1, a0, 1
    VLD_DST_ADD_W4 vr11, vr13
.endm

.macro inv_identity_dct_4x4_lsx
    vld vr0, a2, 0
    vld vr1, a2, 16

    li.w t0, 1697
    vreplgr2vr.w vr20, t0

    identity_4x4_lsx vr0, vr0, vr20, vr0, vr0
    identity_4x4_lsx vr1, vr1, vr20, vr1, vr1

    vreplgr2vr.h vr15, zero

    vilvl.h vr4, vr1, vr0 // 0 2 4 6 8 10 12 14
    vilvh.h vr5, vr1, vr0 // 1 3 5 7 9 11 13 15
    vilvl.h vr13, vr5, vr4 // 0 1 2 3 4 5 6 7
    vilvh.h vr14, vr5, vr4 // 8 9 10 11 12 13 14 15

    vst vr15, a2, 0
    vst vr15, a2, 16

    la.local t0, idct_coeffs

    vldrepl.w vr20, t0, 8 // 1567
    vldrepl.w vr21, t0, 12 // 3784
    vldrepl.w vr22, t0, 0 // 2896

    dct_4x4_core_lsx vr13, vr14, vr13, vr14, vr21, vr20, vr22, vr22, vr13, vr14

    vshuf4i.d vr14, vr14, 0x01
    vsrari.h vr13, vr13, 4
    vsrari.h vr14, vr14, 4

    alsl.d t2, a1, a0, 1
    VLD_DST_ADD_W4 vr13, vr14
.endm

.macro inv_flipadst_identity_4x4_lsx
    vld vr0, a2, 0
    vld vr1, a2, 16

    la.local t0, iadst4_coeffs
    vsllwil.w.h vr2, vr0, 0 // in0
    vexth.w.h vr3, vr0 // in1
    vsllwil.w.h vr4, vr1, 0 // in2
    vexth.w.h vr5, vr1 // in3
    vldrepl.w vr20, t0, 0 // 1321
    vldrepl.w vr21, t0, 4 // 3803
    vldrepl.w vr22, t0, 8 // 2482
    vldrepl.w vr23, t0, 12 // 3344

    adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr10, vr11, vr12, vr13

    vssrarni.h.w vr12, vr13, 12
    vssrarni.h.w vr10, vr11, 12

    vilvl.h vr4, vr10, vr12 // 0 2 4 6 8 10 12 14
    vilvh.h vr5, vr10, vr12 // 1 3 5 7 9 11 13 15
    vilvl.h vr11, vr5, vr4 // 0 1 2 3 4 5 6 7
    vilvh.h vr13, vr5, vr4 // 8 9 10 11 12 13 14 15

    vreplgr2vr.h vr15, zero
    li.w t0, 1697

    vst vr15, a2, 0
    vst vr15, a2, 16
    vreplgr2vr.w vr20, t0

    identity_4x4_lsx vr11, vr11, vr20, vr11, vr6
    identity_4x4_lsx vr13, vr13, vr20, vr13, vr7
    vsrari.h vr11, vr6, 4
    vsrari.h vr13, vr7, 4

    alsl.d t2, a1, a0, 1
    VLD_DST_ADD_W4 vr11, vr13
.endm

.macro inv_identity_flipadst_4x4_lsx
    vld vr0, a2, 0
    vld vr1, a2, 16

    li.w t0, 1697
    vreplgr2vr.w vr20, t0

    identity_4x4_lsx vr0, vr0, vr20, vr0, vr0
    identity_4x4_lsx vr1, vr1, vr20, vr1, vr1

    vilvl.h vr4, vr1, vr0
    vilvh.h vr5, vr1, vr0
    vilvl.h vr11, vr5, vr4
    vilvh.h vr13, vr5, vr4

    vreplgr2vr.h vr15, zero
    vst vr15, a2, 0
    vst vr15, a2, 16

    la.local t0, iadst4_coeffs
    vsllwil.w.h vr2, vr11, 0 // in0
    vexth.w.h vr3, vr11 // in1
    vsllwil.w.h vr4, vr13, 0 // in2
    vexth.w.h vr5, vr13 // in3
    vldrepl.w vr20, t0, 0 // 1321
    vldrepl.w vr21, t0, 4 // 3803
    vldrepl.w vr22, t0, 8 // 2482
    vldrepl.w vr23, t0, 12 // 3344

    adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3

    vssrarni.h.w vr0, vr1, 12 // 8 9 10 11 12 13 14 15
    vssrarni.h.w vr2, vr3, 12 // 0 1 2 3 4 5 6 7
    vsrari.h vr11, vr0, 4
    vsrari.h vr13, vr2, 4

    alsl.d t2, a1, a0, 1
    VLD_DST_ADD_W4 vr13, vr11
.endm

.macro inv_identity_adst_4x4_lsx
    vld vr0, a2, 0
    vld vr1, a2, 16

    li.w t0, 1697
    vreplgr2vr.w vr20, t0

    identity_4x4_lsx vr0, vr0, vr20, vr0, vr0
    identity_4x4_lsx vr1, vr1, vr20, vr1, vr1

    vilvl.h vr4, vr1, vr0
    vilvh.h vr5, vr1, vr0
    vilvl.h vr11, vr5, vr4
    vilvh.h vr13, vr5, vr4

    vreplgr2vr.h vr15, zero
    vst vr15, a2, 0
    vst vr15, a2, 16

    la.local t0, iadst4_coeffs
    vsllwil.w.h vr2, vr11, 0 // in0
    vexth.w.h vr3, vr11 // in1
    vsllwil.w.h vr4, vr13, 0 // in2
    vexth.w.h vr5, vr13 // in3
    vldrepl.w vr20, t0, 0 // 1321
    vldrepl.w vr21, t0, 4 // 3803
    vldrepl.w vr22, t0, 8 // 2482
    vldrepl.w vr23, t0, 12 // 3344

    adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3

    vssrarni.h.w vr1, vr0, 12
    vssrarni.h.w vr3, vr2, 12
    vsrari.h vr11, vr1, 4
    vsrari.h vr13, vr3, 4

    alsl.d t2, a1, a0, 1
    VLD_DST_ADD_W4 vr11, vr13
.endm

.macro inv_adst_identity_4x4_lsx
    vld vr0, a2, 0
    vld vr1, a2, 16

    la.local t0, iadst4_coeffs
    vsllwil.w.h vr2, vr0, 0 // in0
    vexth.w.h vr3, vr0 // in1
    vsllwil.w.h vr4, vr1, 0 // in2
    vexth.w.h vr5, vr1 // in3
    vldrepl.w vr20, t0, 0 // 1321
    vldrepl.w vr21, t0, 4 // 3803
    vldrepl.w vr22, t0, 8 // 2482
    vldrepl.w vr23, t0, 12 // 3344

    adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3

    LSX_TRANSPOSE4x4_W vr0, vr1, vr2, vr3, vr11, vr13, vr12, vr14, vr6, vr7

    vssrarni.h.w vr13, vr11, 12
    vssrarni.h.w vr14, vr12, 12

    vreplgr2vr.h vr15, zero
    li.w t0, 1697

    vst vr15, a2, 0
    vst vr15, a2, 16
    vreplgr2vr.w vr20, t0

    identity_4x4_lsx vr13, vr13, vr20, vr13, vr6
    identity_4x4_lsx vr14, vr14, vr20, vr14, vr7
    vsrari.h vr11, vr6, 4
    vsrari.h vr13, vr7, 4

    alsl.d t2, a1, a0, 1
    VLD_DST_ADD_W4 vr11, vr13
.endm

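// Entry-point generator for all 4x4 type1_type2 combinations. For dct_dct,
// a DC-only fast path is taken when eob (a3) is 0: both 1-D passes reduce
// to scaling the DC coefficient by 181/256 ~= 1/sqrt(2) (181 being
// 128*sqrt(2), rounded); the second scale, its rounding bias and the final
// 4-bit shift are folded into one multiply-add plus a 12-bit rounding narrow.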
.macro fun4x4 type1, type2
function inv_txfm_add_\type1\()_\type2\()_4x4_8bpc_lsx
.ifc \type1\()_\type2, dct_dct
    bnez a3, .LLL

    vldi vr0, 0x8b5 // 181
    ld.h t2, a2, 0 // dc
    st.h zero, a2, 0
    vreplgr2vr.w vr1, t2
    vldi vr3, 0x880 // 128
    vmul.w vr2, vr0, vr1
    vld vr10, a0, 0
    vsrari.w vr2, vr2, 8
    vldx vr11, a0, a1
    vmadd.w vr3, vr2, vr0
    alsl.d t2, a1, a0, 1
    vssrarni.h.w vr3, vr3, 12
    vld vr12, t2, 0
    vldx vr13, t2, a1

    DST_ADD_W4 vr10, vr11, vr12, vr13, vr3, vr3

    b .IDST_\type1\()_\type2\()_4X4_END
.LLL:
.endif

    inv_\type1\()_\type2\()_4x4_lsx
.IDST_\type1\()_\type2\()_4X4_END:
endfunc
.endm

fun4x4 dct, dct
fun4x4 identity, identity
fun4x4 adst, dct
fun4x4 dct, adst
fun4x4 adst, adst
fun4x4 dct, flipadst
fun4x4 flipadst, adst
fun4x4 adst, flipadst
fun4x4 flipadst, dct
fun4x4 flipadst, flipadst
fun4x4 dct, identity
fun4x4 identity, dct
fun4x4 flipadst, identity
fun4x4 identity, flipadst
fun4x4 identity, adst
fun4x4 adst, identity

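// 4x8 is a rectangular transform: every coefficient is pre-scaled by
// 2896/4096 ~= 1/sqrt(2) (done inline in DCT4_4Wx8H_1D_LSX below) before
// the two DCT passes.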
function inv_txfm_add_dct_dct_4x8_8bpc_lsx
    bnez a3, .NO_HAS_DCONLY_4x8

    ld.h t2, a2, 0 // dc
    vldi vr0, 0x8b5 // 181
    vreplgr2vr.w vr1, t2
    vldi vr5, 0x880 // 128
    vmul.w vr2, vr0, vr1
    st.h zero, a2, 0
    vsrari.w vr2, vr2, 8
    vld vr10, a0, 0
    vmul.w vr2, vr2, vr0
    vldx vr11, a0, a1
    vsrari.w vr2, vr2, 8
    alsl.d t2, a1, a0, 1
    vmadd.w vr5, vr2, vr0
    vld vr12, t2, 0
    vssrarni.h.w vr5, vr5, 12
    vldx vr13, t2, a1

    DST_ADD_W4 vr10, vr11, vr12, vr13, vr5, vr5

    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, t2, 2

    VLD_DST_ADD_W4 vr5, vr5
    b .DCT_DCT_4x8_END

.NO_HAS_DCONLY_4x8:
    // sh=8 sw=4
    la.local t0, idct_coeffs

    vld vr0, a2, 0 // 0 1 2 3 4 5 6 7 in0
    vld vr1, a2, 16 // 8 9 10 11 12 13 14 15 in1
    vld vr20, a2, 32 // 16 17 18 19 20 21 22 23 in2
    vld vr21, a2, 48 // 24 25 26 27 28 29 30 31 in3

    vldrepl.w vr2, t0, 8 // 1567
    vldrepl.w vr3, t0, 12 // 3784
    vldrepl.w vr8, t0, 0 // 2896

.macro DCT4_4Wx8H_1D_LSX
    // in1 in3
    vsllwil.w.h vr4, vr1, 0 // in1
    vsllwil.w.h vr5, vr21, 0 // in3
    vmul.w vr4, vr4, vr8
    vmul.w vr5, vr5, vr8
    vsrari.w vr4, vr4, 12
    vsrari.w vr5, vr5, 12
    vmul.w vr6, vr4, vr3
    vmul.w vr7, vr4, vr2
    vmadd.w vr6, vr5, vr2 // t3 0 1 2 3
    vmsub.w vr7, vr5, vr3 // t2 0 1 2 3
    vexth.w.h vr4, vr1 // in1
    vexth.w.h vr5, vr21 // in3
    vmul.w vr4, vr4, vr8
    vmul.w vr5, vr5, vr8
    vsrari.w vr4, vr4, 12
    vsrari.w vr5, vr5, 12
    vmul.w vr9, vr4, vr3
    vmul.w vr10, vr4, vr2
    vmadd.w vr9, vr5, vr2 // t3 4 5 6 7
    vmsub.w vr10, vr5, vr3 // t2 4 5 6 7

    // in0 in2
    vsllwil.w.h vr4, vr0, 0 // in0
    vsllwil.w.h vr5, vr20, 0 // in2
    vmul.w vr4, vr4, vr8
    vmul.w vr5, vr5, vr8
    vsrari.w vr4, vr4, 12
    vsrari.w vr5, vr5, 12
    vmul.w vr11, vr4, vr8
    vmul.w vr12, vr4, vr8
    vmadd.w vr11, vr5, vr8 // t0 0 1 2 3
    vmsub.w vr12, vr5, vr8 // t1 0 1 2 3
    vexth.w.h vr4, vr0 // in0
    vexth.w.h vr5, vr20 // in2
    vmul.w vr4, vr4, vr8
    vmul.w vr5, vr5, vr8
    vsrari.w vr4, vr4, 12
    vsrari.w vr5, vr5, 12
    vmul.w vr13, vr4, vr8
    vmul.w vr14, vr4, vr8
    vmadd.w vr13, vr5, vr8 // t0 4 5 6 7
    vmsub.w vr14, vr5, vr8 // t1 4 5 6 7
    vssrarni.h.w vr9, vr6, 12 // t3
    vssrarni.h.w vr10, vr7, 12 // t2
    vssrarni.h.w vr14, vr12, 12 // t1
    vssrarni.h.w vr13, vr11, 12 // t0
    vsadd.h vr4, vr13, vr9 // c[0] 0 4 8 12 16 20 24 28
    vsadd.h vr5, vr14, vr10 // c[1] 1 5 9 13 17 21 25 29
    vssub.h vr20, vr14, vr10 // c[2] 2 6 10 14 18 22 26 30
    vssub.h vr21, vr13, vr9 // c[3] 3 7 11 15 19 23 27 31
.endm

    DCT4_4Wx8H_1D_LSX

    vreplgr2vr.h vr22, zero
    vst vr22, a2, 0
    vst vr22, a2, 16
    vst vr22, a2, 32
    vst vr22, a2, 48

    vilvl.h vr0, vr5, vr4 // 0 1 4 5 8 9 12 13
    vilvl.h vr1, vr21, vr20 // 2 3 6 7 10 11 14 15
    vilvh.h vr6, vr5, vr4 // 16 17 20 21 24 25 28 29
    vilvh.h vr7, vr21, vr20 // 18 19 22 23 26 27 30 31
    vilvl.w vr9, vr1, vr0 // 0 1 2 3 4 5 6 7 in0
    vilvh.w vr10, vr1, vr0 // 8 9 10 11 12 13 14 15 in1
    vilvl.w vr11, vr7, vr6 // 16 17 18 19 20 21 22 23 in2
    vilvh.w vr12, vr7, vr6 // 24 25 26 27 28 29 30 31 in3

    vilvl.d vr0, vr10, vr9
    vilvl.d vr1, vr12, vr11
    vilvh.d vr20, vr9, vr11 // in5 in1
    vilvh.d vr21, vr12, vr10 // in3 in7

.macro DCT8_4Wx8H_1D_LSX
    dct_4x4_core_lsx vr0, vr1, vr0, vr1, vr3, vr2, vr8, vr8, vr13, vr14

    vldrepl.w vr17, t0, 16 // 799
    vldrepl.w vr18, t0, 20 // 4017
    vldrepl.w vr11, t0, 24 // 3406
    vldrepl.w vr12, t0, 28 // 2276

    vexth.w.h vr4, vr20
    vexth.w.h vr5, vr21
    vmul.w vr6, vr4, vr18 // in1 * 4017
    vmul.w vr7, vr4, vr17 // in1 * 799
    vmadd.w vr6, vr5, vr17 // in7 * 799
    vmsub.w vr7, vr5, vr18 // in7 * 4017
    vsllwil.w.h vr4, vr20, 0
    vsllwil.w.h vr5, vr21, 0
    vmul.w vr9, vr4, vr12
    vmul.w vr10, vr4, vr11
    vmadd.w vr9, vr5, vr11
    vmsub.w vr10, vr5, vr12
    vssrarni.h.w vr10, vr9, 12 // t6a t5a
    vssrarni.h.w vr7, vr6, 12 // t7a t4a
    vsadd.h vr15, vr7, vr10 // t7 t4
    vssub.h vr16, vr7, vr10 // t6a t5a

    vexth.w.h vr4, vr16 // t5a
    vsllwil.w.h vr5, vr16, 0 // t6a
    vldi vr2, 0x8b5 // 181
    vsub.w vr6, vr5, vr4
    vadd.w vr7, vr5, vr4
    vmul.w vr6, vr6, vr2
    vmul.w vr7, vr7, vr2
    vssrarni.h.w vr7, vr6, 8 // t5 t6
    vaddi.hu vr18, vr7, 0
    vshuf4i.d vr7, vr15, 0x06 // t7 t6
    vshuf4i.d vr15, vr18, 0x09 // t4 t5

    // vr17 -> vr7 vr18 -> vr15
    vsadd.h vr4, vr13, vr7
    vsadd.h vr5, vr14, vr15
    vssub.h vr6, vr14, vr15
    vssub.h vr7, vr13, vr7
.endm

    DCT8_4Wx8H_1D_LSX

    vshuf4i.d vr5, vr5, 0x01
    vshuf4i.d vr7, vr7, 0x01

    vsrari.h vr4, vr4, 4
    vsrari.h vr5, vr5, 4
    vsrari.h vr6, vr6, 4
    vsrari.h vr7, vr7, 4

    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W4 vr4, vr5

    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, t2, 2

    VLD_DST_ADD_W4 vr6, vr7
.DCT_DCT_4x8_END:
endfunc

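// rect2_w4_lsx: widen the low four halfwords of \in0 and the high four of
// \in1 to 32 bit, then apply the rectangular-transform scale
// round2(x * \in2, 12), i.e. x/sqrt(2) for \in2 = 2896.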
.macro rect2_w4_lsx in0, in1, in2, out0, out1
    vsllwil.w.h vr22, \in0, 0
    vexth.w.h vr23, \in1
    vmul.w vr22, vr22, \in2
    vmul.w vr23, vr23, \in2
    vsrari.w \out0, vr22, 12
    vsrari.w \out1, vr23, 12
.endm

.macro dct_8x4_core_lsx1 out0, out1, out2, out3
    // dct4 stride=1<<1
    vmul.w vr0, vr6, vr21
    vmul.w vr1, vr6, vr20
    vmadd.w vr0, vr10, vr20 // t3
    vmsub.w vr1, vr10, vr21 // t2
    vmul.w vr2, vr18, vr22
    vmul.w vr3, vr18, vr22
    vmadd.w vr2, vr8, vr22 // t0
    vmsub.w vr3, vr8, vr22 // t1
    vssrarni.h.w vr1, vr0, 12 // t3 t2
    vssrarni.h.w vr3, vr2, 12 // t0 t1
    vsadd.h vr8, vr3, vr1 // t0 t1
    vssub.h vr10, vr3, vr1 // t3 t2

    vldrepl.w vr20, t0, 16 // 799
    vldrepl.w vr21, t0, 20 // 4017
    vldrepl.w vr22, t0, 24 // 3406
    vldrepl.w vr23, t0, 28 // 2276

    vmul.w vr0, vr19, vr21 // in1 * 4017
    vmul.w vr1, vr19, vr20 // in1 * 799
    vmadd.w vr0, vr11, vr20 // in7 * 799 // t7a
    vmsub.w vr1, vr11, vr21 // in7 * 4017 // t4a
    vmul.w vr2, vr9, vr23 // in5 * 2276
    vmul.w vr3, vr9, vr22 // in5 * 3406
    vmadd.w vr2, vr7, vr22 // in3 * 3406 // t6a
    vmsub.w vr3, vr7, vr23 // in3 * 2276 // t5a
    vssrarni.h.w vr0, vr1, 12 // t4a t7a
    vssrarni.h.w vr2, vr3, 12 // t5a t6a
    vsadd.h vr9, vr0, vr2 // t4 t7
    vssub.h vr11, vr0, vr2 // t5a t6a

    vldrepl.w vr22, t0, 0 // 2896
    vexth.w.h vr18, vr11 // t6a
    vsllwil.w.h vr19, vr11, 0 // t5a
    vmul.w vr6, vr18, vr22
    vmul.w vr7, vr18, vr22
    vmadd.w vr6, vr19, vr22 // t6
    vmsub.w vr7, vr19, vr22 // t5
    vssrarni.h.w vr6, vr7, 12 // t5 t6

    vilvh.d vr11, vr6, vr9 // t7 t6
    vilvl.d vr9, vr6, vr9 // t4 t5

    vsadd.h \out0, vr8, vr11 // c[0] c[1]
    vsadd.h \out1, vr10, vr9 // c[3] c[2]
    vssub.h \out2, vr10, vr9 // c[4] c[5]
    vssub.h \out3, vr8, vr11 // c[7] c[6]
.endm

.macro dct_8x4_core_lsx2 in0, in1, in2, in3, in4, in5, in6, in7, \
                         out0, out1, out2, out3
    vexth.w.h vr4, \in0 // in1
    vexth.w.h vr5, \in1 // in3
    vmul.w vr6, vr4, \in4
    vmul.w vr7, vr4, \in5
    vmadd.w vr6, vr5, \in5 // t3
    vmsub.w vr7, vr5, \in4 // t2
    vexth.w.h vr4, \in2 // in1
    vexth.w.h vr5, \in3 // in3
    vmul.w vr8, vr4, \in4
    vmul.w vr9, vr4, \in5
    vmadd.w vr8, vr5, \in5 // t3
    vmsub.w vr9, vr5, \in4 // t2
    vssrarni.h.w vr8, vr6, 12 // t3
    vssrarni.h.w vr9, vr7, 12 // t2

    vsllwil.w.h vr4, \in0, 0
    vsllwil.w.h vr5, \in1, 0
    vmul.w vr11, vr4, \in6
    vmul.w vr12, vr4, \in7
    vmadd.w vr11, vr5, \in7 // t0
    vmsub.w vr12, vr5, \in6 // t1
    vsllwil.w.h vr4, \in2, 0
    vsllwil.w.h vr5, \in3, 0
    vmul.w vr13, vr4, \in6
    vmul.w vr14, vr4, \in7
    vmadd.w vr13, vr5, \in7 // t0
    vmsub.w vr14, vr5, \in6 // t1
    vssrarni.h.w vr13, vr11, 12 // t0
    vssrarni.h.w vr14, vr12, 12 // t1

    vsadd.h \out0, vr13, vr8
    vsadd.h \out1, vr14, vr9
    vssub.h \out2, vr14, vr9
    vssub.h \out3, vr13, vr8
.endm

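// Add four rows of eight 16-bit residuals (\in4..\in7) to the destination:
// widen 8 pixels per row, add, saturate back to bytes and store the rows
// at a0, a0+a1, t2 and t2+a1.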
.macro DST_ADD_W8 in0, in1, in2, in3, in4, in5, in6, in7
    vsllwil.hu.bu vr10, \in0, 0
    vsllwil.hu.bu vr11, \in1, 0
    vsllwil.hu.bu vr12, \in2, 0
    vsllwil.hu.bu vr13, \in3, 0
    vadd.h vr10, \in4, vr10
    vadd.h vr11, \in5, vr11
    vadd.h vr12, \in6, vr12
    vadd.h vr13, \in7, vr13
    vssrani.bu.h vr11, vr10, 0
    vssrani.bu.h vr13, vr12, 0
    vstelm.d vr11, a0, 0, 0
    add.d t8, a0, a1
    vstelm.d vr11, t8, 0, 1
    vstelm.d vr13, t2, 0, 0
    add.d t8, t2, a1
    vstelm.d vr13, t8, 0, 1
.endm

.macro VLD_DST_ADD_W8 in0, in1, in2, in3
    vld vr0, a0, 0
    vldx vr1, a0, a1
    vld vr2, t2, 0
    vldx vr3, t2, a1

    DST_ADD_W8 vr0, vr1, vr2, vr3, \in0, \in1, \in2, \in3
.endm

function inv_txfm_add_dct_dct_8x4_8bpc_lsx
    bnez a3, .NO_HAS_DCONLY_8x4

    ld.h t2, a2, 0 // dc
    vldi vr0, 0x8b5 // 181
    vreplgr2vr.w vr1, t2
    vldi vr5, 0x880 // 128
    vmul.w vr2, vr0, vr1
    st.h zero, a2, 0
    vsrari.w vr2, vr2, 8
    vld vr10, a0, 0
    vmul.w vr2, vr2, vr0
    vldx vr11, a0, a1
    vsrari.w vr2, vr2, 8
    alsl.d t2, a1, a0, 1
    vmadd.w vr5, vr2, vr0
    vld vr12, t2, 0
    vssrarni.h.w vr5, vr5, 12
    vldx vr13, t2, a1

    DST_ADD_W8 vr10, vr11, vr12, vr13, vr5, vr5, vr5, vr5

    b .DCT_DCT_8X4_END

.NO_HAS_DCONLY_8x4:
    la.local t0, idct_coeffs

    vld vr0, a2, 0
    vld vr1, a2, 16
    vld vr2, a2, 32
    vld vr3, a2, 48

    vldrepl.w vr20, t0, 0 // 2896

    rect2_w4_lsx vr0, vr0, vr20, vr18, vr19
    rect2_w4_lsx vr1, vr1, vr20, vr6, vr7
    rect2_w4_lsx vr2, vr2, vr20, vr8, vr9
    rect2_w4_lsx vr3, vr3, vr20, vr10, vr11

    vldrepl.w vr20, t0, 8 // 1567
    vldrepl.w vr21, t0, 12 // 3784
    vldrepl.w vr22, t0, 0 // 2896

    dct_8x4_core_lsx1 vr0, vr1, vr2, vr3

    vshuf4i.d vr1, vr1, 0x01
    vshuf4i.d vr3, vr3, 0x01

    vilvl.h vr4, vr1, vr0 // 0 2 4 6 8 10 12 14
    vilvh.h vr5, vr1, vr0 // 1 3 5 7 9 11 13 15
    vilvl.h vr0, vr5, vr4 // 0 1 2 3 4 5 6 7 in0
    vilvh.h vr1, vr5, vr4 // 8 9 10 11 12 13 14 15 in1
    vilvl.h vr4, vr3, vr2 // 0 2 4 6 8 10 12 14
    vilvh.h vr5, vr3, vr2 // 1 3 5 7 9 11 13 15
    vilvl.h vr2, vr5, vr4 // 16 - 23 in2
    vilvh.h vr3, vr5, vr4 // 24 - 31 in3

    la.local t0, idct_coeffs

    vreplgr2vr.h vr23, zero
    vst vr23, a2, 0
    vst vr23, a2, 16
    vst vr23, a2, 32
    vst vr23, a2, 48

    vldrepl.w vr20, t0, 8 // 1567
    vldrepl.w vr21, t0, 12 // 3784

    dct_8x4_core_lsx2 vr0, vr1, vr2, vr3, vr21, vr20, vr22, \
                      vr22, vr15, vr16, vr17, vr18

    vsrari.h vr15, vr15, 4
    vsrari.h vr16, vr16, 4
    vsrari.h vr17, vr17, 4
    vsrari.h vr18, vr18, 4

    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr15, vr16, vr17, vr18

.DCT_DCT_8X4_END:
endfunc

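// 8-point identity: pack the rounded 32-bit halves back to 16 bit and
// double them with a saturating add (the identity8 scale factor is 2).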
.macro identity8_lsx in0, in1, in2, in3, in4, in5, in6, in7, \
                     out0, out1, out2, out3
    vssrarni.h.w \in1, \in0, 0
    vssrarni.h.w \in3, \in2, 0
    vssrarni.h.w \in5, \in4, 0
    vssrarni.h.w \in7, \in6, 0
    vsadd.h \out0, \in1, \in1
    vsadd.h \out1, \in3, \in3
    vsadd.h \out2, \in5, \in5
    vsadd.h \out3, \in7, \in7
.endm

function inv_txfm_add_identity_identity_8x4_8bpc_lsx
    la.local t0, idct_coeffs

    vld vr0, a2, 0 // 0 1 2 3 4 5 6 7 in0
    vld vr1, a2, 16 // 8 9 10 11 12 13 14 15 in1
    vld vr2, a2, 32 // 16 17 18 19 20 21 22 23 in2
    vld vr3, a2, 48 // 24 25 26 27 28 29 30 31 in3

    vldrepl.w vr20, t0, 0 // 2896

    rect2_w4_lsx vr0, vr0, vr20, vr18, vr19
    rect2_w4_lsx vr1, vr1, vr20, vr6, vr7
    rect2_w4_lsx vr2, vr2, vr20, vr8, vr9
    rect2_w4_lsx vr3, vr3, vr20, vr10, vr11

    identity8_lsx vr18, vr19, vr6, vr7, vr8, vr9, vr10, vr11, \
                  vr19, vr7, vr9, vr11

    vreplgr2vr.h vr23, zero
    vst vr23, a2, 0
    vst vr23, a2, 16
    vst vr23, a2, 32
    vst vr23, a2, 48

    li.w t0, 1697
    vreplgr2vr.w vr20, t0
    identity_4x4_lsx vr19, vr19, vr20, vr19, vr19
    identity_4x4_lsx vr7, vr7, vr20, vr7, vr7
    identity_4x4_lsx vr9, vr9, vr20, vr9, vr9
    identity_4x4_lsx vr11, vr11, vr20, vr11, vr11

    vsrari.h vr15, vr19, 4
    vsrari.h vr16, vr7, 4
    vsrari.h vr17, vr9, 4
    vsrari.h vr18, vr11, 4

    vilvl.h vr4, vr16, vr15
    vilvh.h vr5, vr16, vr15
    vilvl.h vr11, vr5, vr4
    vilvh.h vr12, vr5, vr4
    vilvl.h vr4, vr18, vr17
    vilvh.h vr5, vr18, vr17
    vilvl.h vr13, vr5, vr4
    vilvh.h vr14, vr5, vr4
    vilvl.d vr15, vr13, vr11
    vilvh.d vr16, vr13, vr11
    vilvl.d vr17, vr14, vr12
    vilvh.d vr18, vr14, vr12

    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr15, vr16, vr17, vr18
endfunc

const iadst8_coeffs, align=4
    .word 4076, 401, 3612, 1931
    .word 2598, 3166, 1189, 3920
    // idct_coeffs
    .word 2896, 0, 1567, 3784, 0, 0, 0, 0
endconst

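// Two independent rotations in one macro: out1 packs
// round2(in0*in4 + in1*in6, 12) with round2(in0*in5 - in1*in7, 12), and
// out3 does the same for in2/in3 with in8..in11. This is the butterfly
// building block of the 8-point ADST below.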
.macro vmadd_vmsub_vssrarni_hw_12 in0, in1, in2, in3, in4, in5, in6, in7, \
                                  in8, in9, in10, in11, out0, out1, out2, out3
    vmul.w \out0, \in0, \in4
    vmul.w \out1, \in0, \in5
    vmadd.w \out0, \in1, \in6 // t0a
    vmsub.w \out1, \in1, \in7 // t1a
    vmul.w \out2, \in2, \in8
    vmul.w \out3, \in2, \in9
    vmadd.w \out2, \in3, \in10 // t2a
    vmsub.w \out3, \in3, \in11 // t3a
    vssrarni.h.w \out1, \out0, 12 // t0a t1a
    vssrarni.h.w \out3, \out2, 12 // t2a t3a
.endm

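// One 8-point inverse ADST pass over four columns. Expects the widened,
// rect2-scaled inputs in vr18/vr19 and vr6..vr11 (as produced by the
// rect2_w4_lsx calls at each call site) and leaves the outputs packed
// pairwise: vr13 = out0|out7, vr17 = out1|out6, vr18 = out2|out5,
// vr15 = out3|out4.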
.macro adst8x4_1d_lsx
    la.local t0, iadst8_coeffs

    vldrepl.w vr20, t0, 0 // 4076
    vldrepl.w vr21, t0, 4 // 401
    vldrepl.w vr22, t0, 8 // 3612
    vldrepl.w vr23, t0, 12 // 1931

    // vr13 t0a t1a vr15 t2a t3a
    vmadd_vmsub_vssrarni_hw_12 vr11, vr18, vr9, vr6, vr20, vr21, vr21, vr20, \
                               vr22, vr23, vr23, vr22, vr12, vr13, vr14, vr15
    vldrepl.w vr20, t0, 16 // 2598
    vldrepl.w vr21, t0, 20 // 3166
    vldrepl.w vr22, t0, 24 // 1189
    vldrepl.w vr23, t0, 28 // 3920

    // vr18 t4a t5a vr6 t6a t7a
    vmadd_vmsub_vssrarni_hw_12 vr7, vr8, vr19, vr10, vr20, vr21, vr21, vr20, \
                               vr22, vr23, vr23, vr22, vr11, vr18, vr9, vr6

    vsadd.h vr12, vr13, vr18 // t0 t1
    vsadd.h vr14, vr15, vr6 // t2 t3
    vssub.h vr16, vr13, vr18 // t4 t5
    vssub.h vr18, vr15, vr6 // t6 t7

    la.local t0, idct_coeffs

    vldrepl.w vr20, t0, 8 // 1567
    vldrepl.w vr21, t0, 12 // 3784
    vldrepl.w vr22, t0, 0 // 2896

    vsllwil.w.h vr7, vr16, 0 // t4
    vexth.w.h vr8, vr16 // t5
    vsllwil.w.h vr10, vr18, 0 // t6
    vexth.w.h vr11, vr18 // t7

    // vr13 out0 out7 vr17 out1 out6
    vmadd_vmsub_vssrarni_hw_12 vr7, vr8, vr11, vr10, vr21, vr20, vr20, vr21, \
                               vr20, vr21, vr21, vr20, vr13, vr15, vr17, vr19
    vshuf4i.d vr19, vr19, 0x01

    vsadd.h vr13, vr12, vr14 // out0 out7
    vssub.h vr16, vr12, vr14 // t2 t3
    vsadd.h vr17, vr15, vr19 // out1 out6
    vssub.h vr18, vr15, vr19 // t6 t7

    vexth.w.h vr20, vr13 // out7
    vsllwil.w.h vr21, vr17, 0 // out1
    vneg.w vr20, vr20
    vneg.w vr21, vr21
    vssrarni.h.w vr21, vr20, 0 // out7 out1
    vilvl.d vr13, vr21, vr13 // out0 out7
    vilvh.d vr17, vr17, vr21 // out1 out6

    vsllwil.w.h vr7, vr16, 0 // t2
    vexth.w.h vr8, vr16 // t3
    vsllwil.w.h vr10, vr18, 0 // t6
    vexth.w.h vr11, vr18 // t7

    // vr15 out[3] out[4] vr18 out[2] out[5]
    vmadd_vmsub_vssrarni_hw_12 vr7, vr8, vr10, vr11, vr22, vr22, vr22, vr22, \
                               vr22, vr22, vr22, vr22, vr14, vr15, vr19, vr18

    vexth.w.h vr20, vr18 // out5
    vsllwil.w.h vr21, vr15, 0 // out3
    vneg.w vr20, vr20
    vneg.w vr21, vr21
    vssrarni.h.w vr21, vr20, 0 // out5 out3
    vilvl.d vr18, vr21, vr18 // out2 out5
    vilvh.d vr15, vr15, vr21 // out3 out4
.endm

function inv_txfm_add_adst_dct_8x4_8bpc_lsx
    vld vr0, a2, 0 // 0 1 2 3 4 5 6 7 in0
    vld vr1, a2, 16 // 8 9 10 11 12 13 14 15 in1
    vld vr2, a2, 32 // 16 17 18 19 20 21 22 23 in2
    vld vr3, a2, 48 // 24 25 26 27 28 29 30 31 in3

    la.local t0, idct_coeffs
    vldrepl.w vr20, t0, 0 // 2896

    rect2_w4_lsx vr0, vr0, vr20, vr18, vr19
    rect2_w4_lsx vr1, vr1, vr20, vr6, vr7
    rect2_w4_lsx vr2, vr2, vr20, vr8, vr9
    rect2_w4_lsx vr3, vr3, vr20, vr10, vr11

    adst8x4_1d_lsx

    vilvl.h vr4, vr17, vr13
    vilvl.h vr5, vr15, vr18
    vilvl.w vr0, vr5, vr4
    vilvh.w vr1, vr5, vr4
    vilvh.h vr4, vr18, vr15
    vilvh.h vr5, vr13, vr17
    vilvl.w vr2, vr5, vr4
    vilvh.w vr3, vr5, vr4

    vreplgr2vr.h vr23, zero
    vst vr23, a2, 0
    vst vr23, a2, 16
    vst vr23, a2, 32
    vst vr23, a2, 48

    la.local t0, idct_coeffs

    vldrepl.w vr20, t0, 8 // 1567
    vldrepl.w vr21, t0, 12 // 3784
    vldrepl.w vr22, t0, 0 // 2896

    dct_8x4_core_lsx2 vr0, vr1, vr2, vr3, vr21, vr20, vr22, \
                      vr22, vr15, vr16, vr17, vr18

    vsrari.h vr15, vr15, 4
    vsrari.h vr16, vr16, 4
    vsrari.h vr17, vr17, 4
    vsrari.h vr18, vr18, 4

    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr15, vr16, vr17, vr18
endfunc

function inv_txfm_add_dct_adst_8x4_8bpc_lsx
    vld vr0, a2, 0 // 0 1 2 3 4 5 6 7 in0
    vld vr1, a2, 16 // 8 9 10 11 12 13 14 15 in1
    vld vr2, a2, 32 // 16 17 18 19 20 21 22 23 in2
    vld vr3, a2, 48 // 24 25 26 27 28 29 30 31 in3

    la.local t0, idct_coeffs
    vldrepl.w vr20, t0, 0 // 2896

    rect2_w4_lsx vr0, vr0, vr20, vr18, vr19
    rect2_w4_lsx vr1, vr1, vr20, vr6, vr7
    rect2_w4_lsx vr2, vr2, vr20, vr8, vr9
    rect2_w4_lsx vr3, vr3, vr20, vr10, vr11

    vldrepl.w vr20, t0, 8 // 1567
    vldrepl.w vr21, t0, 12 // 3784
    vldrepl.w vr22, t0, 0 // 2896

    dct_8x4_core_lsx1 vr0, vr1, vr2, vr3

    vshuf4i.d vr1, vr1, 0x01
    vshuf4i.d vr3, vr3, 0x01

    vilvl.h vr4, vr1, vr0
    vilvh.h vr5, vr1, vr0
    vilvl.h vr0, vr5, vr4
    vilvh.h vr1, vr5, vr4
    vilvl.h vr4, vr3, vr2
    vilvh.h vr5, vr3, vr2
    vilvl.h vr2, vr5, vr4
    vilvh.h vr3, vr5, vr4

    la.local t0, iadst4_coeffs

    vreplgr2vr.h vr23, zero
    vst vr23, a2, 0
    vst vr23, a2, 16
    vst vr23, a2, 32
    vst vr23, a2, 48

    vldrepl.w vr20, t0, 0 // 1321
    vldrepl.w vr21, t0, 4 // 3803
    vldrepl.w vr22, t0, 8 // 2482
    vldrepl.w vr23, t0, 12 // 3344

    vsllwil.w.h vr10, vr0, 0
    vexth.w.h vr11, vr0
    vsllwil.w.h vr12, vr1, 0
    vexth.w.h vr13, vr1

    adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13

    vsllwil.w.h vr14, vr2, 0
    vexth.w.h vr15, vr2
    vsllwil.w.h vr16, vr3, 0
    vexth.w.h vr17, vr3

    adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17

    vssrarni.h.w vr14, vr10, 12
    vssrarni.h.w vr15, vr11, 12
    vssrarni.h.w vr16, vr12, 12
    vssrarni.h.w vr17, vr13, 12

    vsrari.h vr14, vr14, 4
    vsrari.h vr15, vr15, 4
    vsrari.h vr16, vr16, 4
    vsrari.h vr17, vr17, 4

    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr14, vr15, vr16, vr17
endfunc

function inv_txfm_add_adst_adst_8x4_8bpc_lsx
    vld vr0, a2, 0 // 0 1 2 3 4 5 6 7 in0
    vld vr1, a2, 16 // 8 9 10 11 12 13 14 15 in1
    vld vr2, a2, 32 // 16 17 18 19 20 21 22 23 in2
    vld vr3, a2, 48 // 24 25 26 27 28 29 30 31 in3

    la.local t0, idct_coeffs
    vldrepl.w vr20, t0, 0 // 2896

    rect2_w4_lsx vr0, vr0, vr20, vr18, vr19
    rect2_w4_lsx vr1, vr1, vr20, vr6, vr7
    rect2_w4_lsx vr2, vr2, vr20, vr8, vr9
    rect2_w4_lsx vr3, vr3, vr20, vr10, vr11

    adst8x4_1d_lsx

    vilvl.h vr4, vr17, vr13
    vilvl.h vr5, vr15, vr18
    vilvl.w vr0, vr5, vr4
    vilvh.w vr1, vr5, vr4
    vilvh.h vr4, vr18, vr15
    vilvh.h vr5, vr13, vr17
    vilvl.w vr2, vr5, vr4
    vilvh.w vr3, vr5, vr4

    la.local t0, iadst4_coeffs

    vreplgr2vr.h vr23, zero
    vst vr23, a2, 0
    vst vr23, a2, 16
    vst vr23, a2, 32
    vst vr23, a2, 48

    vldrepl.w vr20, t0, 0 // 1321
    vldrepl.w vr21, t0, 4 // 3803
    vldrepl.w vr22, t0, 8 // 2482
    vldrepl.w vr23, t0, 12 // 3344

    vsllwil.w.h vr10, vr0, 0
    vexth.w.h vr11, vr0
    vsllwil.w.h vr12, vr1, 0
    vexth.w.h vr13, vr1

    adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13

    vsllwil.w.h vr14, vr2, 0
    vexth.w.h vr15, vr2
    vsllwil.w.h vr16, vr3, 0
    vexth.w.h vr17, vr3

    adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17

    vssrarni.h.w vr14, vr10, 12
    vssrarni.h.w vr15, vr11, 12
    vssrarni.h.w vr16, vr12, 12
    vssrarni.h.w vr17, vr13, 12

    vsrari.h vr14, vr14, 4
    vsrari.h vr15, vr15, 4
    vsrari.h vr16, vr16, 4
    vsrari.h vr17, vr17, 4

    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr14, vr15, vr16, vr17
endfunc

function inv_txfm_add_flipadst_adst_8x4_8bpc_lsx
    vld vr0, a2, 0 // 0 1 2 3 4 5 6 7 in0
    vld vr1, a2, 16 // 8 9 10 11 12 13 14 15 in1
    vld vr2, a2, 32 // 16 17 18 19 20 21 22 23 in2
    vld vr3, a2, 48 // 24 25 26 27 28 29 30 31 in3

    la.local t0, idct_coeffs
    vldrepl.w vr20, t0, 0 // 2896

    rect2_w4_lsx vr0, vr0, vr20, vr18, vr19
    rect2_w4_lsx vr1, vr1, vr20, vr6, vr7
    rect2_w4_lsx vr2, vr2, vr20, vr8, vr9
    rect2_w4_lsx vr3, vr3, vr20, vr10, vr11

    adst8x4_1d_lsx

    vilvl.h vr20, vr15, vr13
    vilvl.h vr21, vr18, vr17
    vilvl.w vr0, vr21, vr20
    vilvh.w vr1, vr21, vr20
    vilvh.h vr20, vr15, vr13
    vilvh.h vr21, vr18, vr17
    vilvl.w vr2, vr21, vr20
    vilvh.w vr3, vr21, vr20
    vshuf4i.h vr0, vr0, 0x2d
    vshuf4i.h vr1, vr1, 0x2d
    vshuf4i.h vr2, vr2, 0x78
    vshuf4i.h vr3, vr3, 0x78

    la.local t0, iadst4_coeffs

    vreplgr2vr.h vr23, zero
    vst vr23, a2, 0
    vst vr23, a2, 16
    vst vr23, a2, 32
    vst vr23, a2, 48

    vldrepl.w vr20, t0, 0 // 1321
    vldrepl.w vr21, t0, 4 // 3803
    vldrepl.w vr22, t0, 8 // 2482
    vldrepl.w vr23, t0, 12 // 3344

    vsllwil.w.h vr10, vr2, 0
    vexth.w.h vr11, vr2
    vsllwil.w.h vr12, vr3, 0
    vexth.w.h vr13, vr3

    adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13

    vsllwil.w.h vr14, vr0, 0
    vexth.w.h vr15, vr0
    vsllwil.w.h vr16, vr1, 0
    vexth.w.h vr17, vr1

    adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17

    vssrarni.h.w vr14, vr10, 12
    vssrarni.h.w vr15, vr11, 12
    vssrarni.h.w vr16, vr12, 12
    vssrarni.h.w vr17, vr13, 12

    vsrari.h vr14, vr14, 4
    vsrari.h vr15, vr15, 4
    vsrari.h vr16, vr16, 4
    vsrari.h vr17, vr17, 4

    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr14, vr15, vr16, vr17
endfunc

function inv_txfm_add_adst_flipadst_8x4_8bpc_lsx
    vld vr0, a2, 0 // in0
    vld vr1, a2, 16 // in1
    vld vr2, a2, 32 // in2
    vld vr3, a2, 48 // in3

    la.local t0, idct_coeffs
    vldrepl.w vr20, t0, 0 // 2896

    rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // 0 8 16 24 1 9 17 25 in0 in1
    rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // 2 10 18 26 3 11 19 27 in2 in3
    rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // 4 12 20 28 5 13 21 29 in4 in5
    rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // 6 14 22 30 7 15 23 31 in6 in7

    adst8x4_1d_lsx

    vilvl.h vr4, vr17, vr13
    vilvl.h vr5, vr15, vr18
    vilvl.w vr0, vr5, vr4
    vilvh.w vr1, vr5, vr4
    vilvh.h vr4, vr18, vr15
    vilvh.h vr5, vr13, vr17
    vilvl.w vr2, vr5, vr4
    vilvh.w vr3, vr5, vr4

    la.local t0, iadst4_coeffs

    vreplgr2vr.h vr23, zero
    vst vr23, a2, 0
    vst vr23, a2, 16
    vst vr23, a2, 32
    vst vr23, a2, 48

    vldrepl.w vr20, t0, 0 // 1321
    vldrepl.w vr21, t0, 4 // 3803
    vldrepl.w vr22, t0, 8 // 2482
    vldrepl.w vr23, t0, 12 // 3344

    vsllwil.w.h vr10, vr0, 0
    vexth.w.h vr11, vr0
    vsllwil.w.h vr12, vr1, 0
    vexth.w.h vr13, vr1

    adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13

    vsllwil.w.h vr14, vr2, 0
    vexth.w.h vr15, vr2
    vsllwil.w.h vr16, vr3, 0
    vexth.w.h vr17, vr3

    adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17

    vssrarni.h.w vr14, vr10, 12
    vssrarni.h.w vr15, vr11, 12
    vssrarni.h.w vr16, vr12, 12
    vssrarni.h.w vr17, vr13, 12

    vsrari.h vr14, vr14, 4
    vsrari.h vr15, vr15, 4
    vsrari.h vr16, vr16, 4
    vsrari.h vr17, vr17, 4

    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr17, vr16, vr15, vr14
endfunc

function inv_txfm_add_flipadst_dct_8x4_8bpc_lsx
    vld vr0, a2, 0 // in0
    vld vr1, a2, 16 // in1
    vld vr2, a2, 32 // in2
    vld vr3, a2, 48 // in3

    la.local t0, idct_coeffs
    vldrepl.w vr20, t0, 0 // 2896

    rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // 0 8 16 24 1 9 17 25 in0 in1
    rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // 2 10 18 26 3 11 19 27 in2 in3
    rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // 4 12 20 28 5 13 21 29 in4 in5
    rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // 6 14 22 30 7 15 23 31 in6 in7

    adst8x4_1d_lsx

    vilvl.h vr20, vr15, vr13
    vilvl.h vr21, vr18, vr17
    vilvl.w vr0, vr21, vr20
    vilvh.w vr1, vr21, vr20
    vilvh.h vr20, vr15, vr13
    vilvh.h vr21, vr18, vr17
    vilvl.w vr2, vr21, vr20
    vilvh.w vr3, vr21, vr20
    vshuf4i.h vr0, vr0, 0x2d
    vshuf4i.h vr1, vr1, 0x2d
    vshuf4i.h vr2, vr2, 0x78
    vshuf4i.h vr3, vr3, 0x78

    vreplgr2vr.h vr23, zero
    vst vr23, a2, 0
    vst vr23, a2, 16
    vst vr23, a2, 32
    vst vr23, a2, 48

    la.local t0, idct_coeffs

    vldrepl.w vr20, t0, 8 // 1567
    vldrepl.w vr21, t0, 12 // 3784
    vldrepl.w vr22, t0, 0 // 2896

    dct_8x4_core_lsx2 vr2, vr3, vr0, vr1, vr21, vr20, vr22, \
                      vr22, vr15, vr16, vr17, vr18

    vsrari.h vr15, vr15, 4
    vsrari.h vr16, vr16, 4
    vsrari.h vr17, vr17, 4
    vsrari.h vr18, vr18, 4

    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr15, vr16, vr17, vr18
endfunc

function inv_txfm_add_dct_flipadst_8x4_8bpc_lsx
    la.local t0, idct_coeffs

    vld vr0, a2, 0 // in0
    vld vr1, a2, 16 // in1
    vld vr2, a2, 32 // in2
    vld vr3, a2, 48 // in3

    vldrepl.w vr20, t0, 0 // 2896

    rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // in0 0 - 7
    rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // in1 8 - 15
    rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // in2 16 - 23
    rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // in3 24 - 31

    vldrepl.w vr20, t0, 8 // 1567
    vldrepl.w vr21, t0, 12 // 3784
    vldrepl.w vr22, t0, 0 // 2896

    dct_8x4_core_lsx1 vr0, vr1, vr2, vr3

    vshuf4i.d vr1, vr1, 0x01
    vshuf4i.d vr3, vr3, 0x01

    vilvl.h vr4, vr1, vr0
    vilvh.h vr5, vr1, vr0
    vilvl.h vr0, vr5, vr4
    vilvh.h vr1, vr5, vr4
    vilvl.h vr4, vr3, vr2
    vilvh.h vr5, vr3, vr2
    vilvl.h vr2, vr5, vr4
    vilvh.h vr3, vr5, vr4

    la.local t0, iadst4_coeffs

    vreplgr2vr.h vr23, zero
    vst vr23, a2, 0
    vst vr23, a2, 16
    vst vr23, a2, 32
    vst vr23, a2, 48

    vldrepl.w vr20, t0, 0 // 1321
    vldrepl.w vr21, t0, 4 // 3803
    vldrepl.w vr22, t0, 8 // 2482
    vldrepl.w vr23, t0, 12 // 3344

    vsllwil.w.h vr10, vr0, 0 // in0
    vexth.w.h vr11, vr0 // in1
    vsllwil.w.h vr12, vr1, 0 // in2
    vexth.w.h vr13, vr1 // in3
    adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13

    vsllwil.w.h vr14, vr2, 0
    vexth.w.h vr15, vr2
    vsllwil.w.h vr16, vr3, 0
    vexth.w.h vr17, vr3
    adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17

    vssrarni.h.w vr14, vr10, 12
    vssrarni.h.w vr15, vr11, 12
    vssrarni.h.w vr16, vr12, 12
    vssrarni.h.w vr17, vr13, 12
    vsrari.h vr14, vr14, 4
    vsrari.h vr15, vr15, 4
    vsrari.h vr16, vr16, 4
    vsrari.h vr17, vr17, 4

    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr17, vr16, vr15, vr14
endfunc

function inv_txfm_add_flipadst_flipadst_8x4_8bpc_lsx
    vld vr0, a2, 0 // in0
    vld vr1, a2, 16 // in1
    vld vr2, a2, 32 // in2
    vld vr3, a2, 48 // in3

    la.local t0, idct_coeffs
    vldrepl.w vr20, t0, 0 // 2896

    rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // 0 8 16 24 1 9 17 25 in0 in1
    rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // 2 10 18 26 3 11 19 27 in2 in3
    rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // 4 12 20 28 5 13 21 29 in4 in5
    rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // 6 14 22 30 7 15 23 31 in6 in7

    adst8x4_1d_lsx

    vilvl.h vr20, vr15, vr13
    vilvl.h vr21, vr18, vr17
    vilvl.w vr0, vr21, vr20
    vilvh.w vr1, vr21, vr20
    vilvh.h vr20, vr15, vr13
    vilvh.h vr21, vr18, vr17
    vilvl.w vr2, vr21, vr20
    vilvh.w vr3, vr21, vr20
    vshuf4i.h vr0, vr0, 0x2d
    vshuf4i.h vr1, vr1, 0x2d
    vshuf4i.h vr2, vr2, 0x78
    vshuf4i.h vr3, vr3, 0x78

    la.local t0, iadst4_coeffs

    vreplgr2vr.h vr23, zero
    vst vr23, a2, 0
    vst vr23, a2, 16
    vst vr23, a2, 32
    vst vr23, a2, 48

    vldrepl.w vr20, t0, 0 // 1321
    vldrepl.w vr21, t0, 4 // 3803
    vldrepl.w vr22, t0, 8 // 2482
    vldrepl.w vr23, t0, 12 // 3344

    vsllwil.w.h vr10, vr2, 0 // in0
    vexth.w.h vr11, vr2 // in1
    vsllwil.w.h vr12, vr3, 0 // in2
    vexth.w.h vr13, vr3 // in3
    adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13

    vsllwil.w.h vr14, vr0, 0
    vexth.w.h vr15, vr0
    vsllwil.w.h vr16, vr1, 0
    vexth.w.h vr17, vr1
    adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17

    vssrarni.h.w vr14, vr10, 12
    vssrarni.h.w vr15, vr11, 12
    vssrarni.h.w vr16, vr12, 12
    vssrarni.h.w vr17, vr13, 12

    vsrari.h vr14, vr14, 4
    vsrari.h vr15, vr15, 4
    vsrari.h vr16, vr16, 4
    vsrari.h vr17, vr17, 4

    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr17, vr16, vr15, vr14
endfunc

function inv_txfm_add_dct_identity_8x4_8bpc_lsx
    vld vr0, a2, 0 // in0
    vld vr1, a2, 16 // in1
    vld vr2, a2, 32 // in2
    vld vr3, a2, 48 // in3

    la.local t0, idct_coeffs
    vldrepl.w vr20, t0, 0 // 2896

    rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // in0 0 - 7
    rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // in1 8 - 15
    rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // in2 16 - 23
    rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // in3 24 - 31

    vldrepl.w vr20, t0, 8 // 1567
    vldrepl.w vr21, t0, 12 // 3784
    vldrepl.w vr22, t0, 0 // 2896

    dct_8x4_core_lsx1 vr0, vr1, vr2, vr3

    vshuf4i.d vr1, vr1, 0x01
    vshuf4i.d vr3, vr3, 0x01

    vilvl.h vr4, vr1, vr0
    vilvh.h vr5, vr1, vr0
    vilvl.h vr0, vr5, vr4
    vilvh.h vr1, vr5, vr4
    vilvl.h vr4, vr3, vr2
    vilvh.h vr5, vr3, vr2
    vilvl.h vr2, vr5, vr4
    vilvh.h vr3, vr5, vr4
    vilvl.d vr14, vr2, vr0
    vilvh.d vr15, vr2, vr0
    vilvl.d vr16, vr3, vr1
    vilvh.d vr17, vr3, vr1

    vreplgr2vr.h vr23, zero
    vst vr23, a2, 0
    vst vr23, a2, 16
    vst vr23, a2, 32
    vst vr23, a2, 48

    li.w t0, 1697
    vreplgr2vr.w vr20, t0

    identity_4x4_lsx vr14, vr14, vr20, vr14, vr14
    identity_4x4_lsx vr15, vr15, vr20, vr15, vr15
    identity_4x4_lsx vr16, vr16, vr20, vr16, vr16
    identity_4x4_lsx vr17, vr17, vr20, vr17, vr17

    vsrari.h vr14, vr14, 4
    vsrari.h vr15, vr15, 4
    vsrari.h vr16, vr16, 4
    vsrari.h vr17, vr17, 4

    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr14, vr15, vr16, vr17
endfunc

function inv_txfm_add_identity_dct_8x4_8bpc_lsx
    vld vr0, a2, 0 // in0
    vld vr1, a2, 16 // in1
    vld vr2, a2, 32 // in2
    vld vr3, a2, 48 // in3

    la.local t0, idct_coeffs
    vldrepl.w vr20, t0, 0 // 2896

    rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // in0 0 - 7
    rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // in1 8 - 15
    rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // in2 16 - 23
    rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // in3 24 - 31

    identity8_lsx vr18, vr19, vr6, vr7, vr8, vr9, vr10, vr11, \
                  vr19, vr7, vr9, vr11

    vreplgr2vr.h vr23, zero
    vst vr23, a2, 0
    vst vr23, a2, 16
    vst vr23, a2, 32
    vst vr23, a2, 48

    vilvl.h vr4, vr7, vr19
    vilvh.h vr5, vr7, vr19
    vilvl.h vr0, vr5, vr4
    vilvh.h vr1, vr5, vr4
    vilvl.h vr4, vr11, vr9
    vilvh.h vr5, vr11, vr9
    vilvl.h vr2, vr5, vr4
    vilvh.h vr3, vr5, vr4

    la.local t0, idct_coeffs

    vldrepl.w vr20, t0, 8 // 1567
    vldrepl.w vr21, t0, 12 // 3784
    vldrepl.w vr22, t0, 0 // 2896

    dct_8x4_core_lsx2 vr0, vr1, vr2, vr3, vr21, vr20, vr22, \
                      vr22, vr15, vr16, vr17, vr18

    vsrari.h vr15, vr15, 4
    vsrari.h vr16, vr16, 4
    vsrari.h vr17, vr17, 4
    vsrari.h vr18, vr18, 4

    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr15, vr16, vr17, vr18
endfunc

function inv_txfm_add_flipadst_identity_8x4_8bpc_lsx
    vld vr0, a2, 0 // in0
    vld vr1, a2, 16 // in1
    vld vr2, a2, 32 // in2
    vld vr3, a2, 48 // in3

    la.local t0, idct_coeffs
    vldrepl.w vr20, t0, 0 // 2896

    rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // 0 8 16 24 1 9 17 25 in0 in1
    rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // 2 10 18 26 3 11 19 27 in2 in3
    rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // 4 12 20 28 5 13 21 29 in4 in5
    rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // 6 14 22 30 7 15 23 31 in6 in7

    adst8x4_1d_lsx

    vilvl.h vr20, vr15, vr13
    vilvl.h vr21, vr18, vr17
    vilvl.w vr0, vr21, vr20
    vilvh.w vr1, vr21, vr20
    vilvh.h vr20, vr15, vr13
    vilvh.h vr21, vr18, vr17
    vilvl.w vr2, vr21, vr20
    vilvh.w vr3, vr21, vr20
    vshuf4i.h vr0, vr0, 0x2d
    vshuf4i.h vr1, vr1, 0x2d
    vshuf4i.h vr2, vr2, 0x78
    vshuf4i.h vr3, vr3, 0x78
    vilvl.d vr14, vr0, vr2 // in0
    vilvh.d vr15, vr0, vr2 // in1
    vilvl.d vr16, vr1, vr3 // in2
    vilvh.d vr17, vr1, vr3 // in3

    vreplgr2vr.h vr23, zero
    vst vr23, a2, 0
    vst vr23, a2, 16
    vst vr23, a2, 32
    vst vr23, a2, 48

    li.w t0, 1697
    vreplgr2vr.w vr20, t0

    identity_4x4_lsx vr14, vr14, vr20, vr14, vr14
    identity_4x4_lsx vr15, vr15, vr20, vr15, vr15
    identity_4x4_lsx vr16, vr16, vr20, vr16, vr16
    identity_4x4_lsx vr17, vr17, vr20, vr17, vr17

    vsrari.h vr14, vr14, 4
    vsrari.h vr15, vr15, 4
    vsrari.h vr16, vr16, 4
    vsrari.h vr17, vr17, 4

    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr14, vr15, vr16, vr17
endfunc

vr10, vr11 // 6 14 22 30 7 15 23 31 in6 in7 2117 2118 adst8x4_1d_lsx 2119 2120 vilvl.h vr20, vr15, vr13 2121 vilvl.h vr21, vr18, vr17 2122 vilvl.w vr0, vr21, vr20 2123 vilvh.w vr1, vr21, vr20 2124 vilvh.h vr20, vr15, vr13 2125 vilvh.h vr21, vr18, vr17 2126 vilvl.w vr2, vr21, vr20 2127 vilvh.w vr3, vr21, vr20 2128 vshuf4i.h vr0, vr0, 0x2d 2129 vshuf4i.h vr1, vr1, 0x2d 2130 vshuf4i.h vr2, vr2, 0x78 2131 vshuf4i.h vr3, vr3, 0x78 2132 vilvl.d vr14, vr0, vr2 // in0 2133 vilvh.d vr15, vr0, vr2 // in1 2134 vilvl.d vr16, vr1, vr3 // in2 2135 vilvh.d vr17, vr1, vr3 // in3 2136 2137 vreplgr2vr.h vr23, zero 2138 vst vr23, a2, 0 2139 vst vr23, a2, 16 2140 vst vr23, a2, 32 2141 vst vr23, a2, 48 2142 2143 li.w t0, 1697 2144 vreplgr2vr.w vr20, t0 2145 2146 identity_4x4_lsx vr14, vr14, vr20, vr14, vr14 2147 identity_4x4_lsx vr15, vr15, vr20, vr15, vr15 2148 identity_4x4_lsx vr16, vr16, vr20, vr16, vr16 2149 identity_4x4_lsx vr17, vr17, vr20, vr17, vr17 2150 2151 vsrari.h vr14, vr14, 4 2152 vsrari.h vr15, vr15, 4 2153 vsrari.h vr16, vr16, 4 2154 vsrari.h vr17, vr17, 4 2155 2156 alsl.d t2, a1, a0, 1 2157 2158 VLD_DST_ADD_W8 vr14, vr15, vr16, vr17 2159endfunc 2160 2161function inv_txfm_add_identity_flipadst_8x4_8bpc_lsx 2162 vld vr0, a2, 0 // in0 2163 vld vr1, a2, 16 // in1 2164 vld vr2, a2, 32 // in2 2165 vld vr3, a2, 48 // in3 2166 2167 la.local t0, idct_coeffs 2168 vldrepl.w vr20, t0, 0 // 2896 2169 2170 rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // in0 0 - 7 2171 rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // in1 8 - 15 2172 rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // in2 16 - 23 2173 rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // in3 24 - 31 2174 2175 identity8_lsx vr18, vr19, vr6, vr7, vr8, vr9, vr10, vr11, \ 2176 vr19, vr7, vr9, vr11 2177 2178 vreplgr2vr.h vr23, zero 2179 vst vr23, a2, 0 2180 vst vr23, a2, 16 2181 vst vr23, a2, 32 2182 vst vr23, a2, 48 2183 2184 vilvl.h vr4, vr7, vr19 2185 vilvh.h vr5, vr7, vr19 2186 vilvl.h vr0, vr5, vr4 2187 vilvh.h vr1, vr5, vr4 2188 vilvl.h vr4, vr11, vr9 2189 vilvh.h vr5, vr11, vr9 2190 vilvl.h vr2, vr5, vr4 2191 vilvh.h vr3, vr5, vr4 2192 2193 la.local t0, iadst4_coeffs 2194 2195 vreplgr2vr.h vr23, zero 2196 vst vr23, a2, 0 2197 vst vr23, a2, 16 2198 vst vr23, a2, 32 2199 vst vr23, a2, 48 2200 2201 vldrepl.w vr20, t0, 0 // 1321 2202 vldrepl.w vr21, t0, 4 // 3803 2203 vldrepl.w vr22, t0, 8 // 2482 2204 vldrepl.w vr23, t0, 12 // 3344 2205 2206 vsllwil.w.h vr10, vr0, 0 // in0 2207 vexth.w.h vr11, vr0 // in1 2208 vsllwil.w.h vr12, vr1, 0 // in2 2209 vexth.w.h vr13, vr1 // in3 2210 adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13 2211 2212 vsllwil.w.h vr14, vr2, 0 2213 vexth.w.h vr15, vr2 2214 vsllwil.w.h vr16, vr3, 0 2215 vexth.w.h vr17, vr3 2216 adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17 2217 2218 vssrarni.h.w vr14, vr10, 12 2219 vssrarni.h.w vr15, vr11, 12 2220 vssrarni.h.w vr16, vr12, 12 2221 vssrarni.h.w vr17, vr13, 12 2222 2223 vsrari.h vr14, vr14, 4 2224 vsrari.h vr15, vr15, 4 2225 vsrari.h vr16, vr16, 4 2226 vsrari.h vr17, vr17, 4 2227 2228 alsl.d t2, a1, a0, 1 2229 2230 VLD_DST_ADD_W8 vr17, vr16, vr15, vr14 2231endfunc 2232 2233function inv_txfm_add_adst_identity_8x4_8bpc_lsx 2234 vld vr0, a2, 0 // in0 2235 vld vr1, a2, 16 // in1 2236 vld vr2, a2, 32 // in2 2237 vld vr3, a2, 48 // in3 2238 2239 la.local t0, idct_coeffs 2240 vldrepl.w vr20, t0, 0 // 2896 2241 2242 rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // 0 8 16 24 1 9 17 25 in0 in1 2243 rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // 2 10 18 26 3 11 19 27 in2 in3 2244 rect2_w4_lsx vr2, vr2, vr20, 
vr8, vr9 // 4 12 20 28 5 13 21 29 in4 in5 2245 rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // 6 14 22 30 7 15 23 31 in6 in7 2246 2247 adst8x4_1d_lsx 2248 2249 vilvl.h vr4, vr17, vr13 2250 vilvl.h vr5, vr15, vr18 2251 vilvl.w vr14, vr5, vr4 // in0 in1 2252 vilvh.w vr16, vr5, vr4 // in2 in3 2253 vilvh.h vr4, vr18, vr15 2254 vilvh.h vr5, vr13, vr17 2255 vilvl.w vr17, vr5, vr4 2256 vilvh.w vr18, vr5, vr4 2257 vilvl.d vr10, vr17, vr14 // in0 2258 vilvh.d vr11, vr17, vr14 // in1 2259 vilvl.d vr12, vr18, vr16 // in2 2260 vilvh.d vr13, vr18, vr16 // in3 2261 2262 vreplgr2vr.h vr23, zero 2263 vst vr23, a2, 0 2264 vst vr23, a2, 16 2265 vst vr23, a2, 32 2266 vst vr23, a2, 48 2267 2268 li.w t0, 1697 2269 vreplgr2vr.w vr20, t0 2270 2271 identity_4x4_lsx vr10, vr10, vr20, vr10, vr15 2272 identity_4x4_lsx vr11, vr11, vr20, vr11, vr16 2273 identity_4x4_lsx vr12, vr12, vr20, vr12, vr17 2274 identity_4x4_lsx vr13, vr13, vr20, vr13, vr18 2275 2276 vsrari.h vr15, vr15, 4 2277 vsrari.h vr16, vr16, 4 2278 vsrari.h vr17, vr17, 4 2279 vsrari.h vr18, vr18, 4 2280 2281 alsl.d t2, a1, a0, 1 2282 2283 VLD_DST_ADD_W8 vr15, vr16, vr17, vr18 2284endfunc 2285 2286function inv_txfm_add_identity_adst_8x4_8bpc_lsx 2287 vld vr0, a2, 0 // in0 2288 vld vr1, a2, 16 // in1 2289 vld vr2, a2, 32 // in2 2290 vld vr3, a2, 48 // in3 2291 2292 la.local t0, idct_coeffs 2293 vldrepl.w vr20, t0, 0 // 2896 2294 2295 rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // in0 0 - 7 2296 rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // in1 8 - 15 2297 rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // in2 16 - 23 2298 rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // in3 24 - 31 2299 2300 identity8_lsx vr18, vr19, vr6, vr7, vr8, vr9, vr10, vr11, \ 2301 vr0, vr1, vr2, vr3 2302 2303 vilvl.h vr4, vr1, vr0 // 0 2 4 6 8 10 12 14 2304 vilvh.h vr5, vr1, vr0 // 1 3 5 7 9 11 13 15 2305 vilvl.h vr0, vr5, vr4 // 0 1 2 3 4 5 6 7 2306 vilvh.h vr1, vr5, vr4 // 8 9 10 11 12 13 14 15 2307 vilvl.h vr4, vr3, vr2 // 0 2 4 6 8 10 12 14 2308 vilvh.h vr5, vr3, vr2 // 1 3 5 7 9 11 13 15 2309 vilvl.h vr2, vr5, vr4 // 0 1 2 3 4 5 6 7 2310 vilvh.h vr3, vr5, vr4 // 8 9 10 11 12 13 14 15 2311 2312 vreplgr2vr.h vr23, zero 2313 vst vr23, a2, 0 2314 vst vr23, a2, 16 2315 vst vr23, a2, 32 2316 vst vr23, a2, 48 2317 2318 la.local t0, iadst4_coeffs 2319 2320 vldrepl.w vr20, t0, 0 // 1321 2321 vldrepl.w vr21, t0, 4 // 3803 2322 vldrepl.w vr22, t0, 8 // 2482 2323 vldrepl.w vr23, t0, 12 // 3344 2324 2325 vsllwil.w.h vr10, vr0, 0 2326 vexth.w.h vr11, vr0 2327 vsllwil.w.h vr12, vr1, 0 2328 vexth.w.h vr13, vr1 2329 2330 adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13 2331 2332 vsllwil.w.h vr14, vr2, 0 2333 vexth.w.h vr15, vr2 2334 vsllwil.w.h vr16, vr3, 0 2335 vexth.w.h vr17, vr3 2336 2337 adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17 2338 2339 vssrarni.h.w vr14, vr10, 12 2340 vssrarni.h.w vr15, vr11, 12 2341 vssrarni.h.w vr16, vr12, 12 2342 vssrarni.h.w vr17, vr13, 12 2343 2344 vsrari.h vr14, vr14, 4 2345 vsrari.h vr15, vr15, 4 2346 vsrari.h vr16, vr16, 4 2347 vsrari.h vr17, vr17, 4 2348 2349 alsl.d t2, a1, a0, 1 2350 2351 VLD_DST_ADD_W8 vr14, vr15, vr16, vr17 2352endfunc 2353 2354function inv_txfm_add_identity_identity_8x8_8bpc_lsx 2355 2356 vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr14, vr15 2357 2358 // identity8 2359 vsllwil.w.h vr6, vr0, 1 2360 vsllwil.w.h vr7, vr1, 1 2361 vsllwil.w.h vr8, vr2, 1 2362 vsllwil.w.h vr9, vr3, 1 2363 vsllwil.w.h vr10, vr4, 1 2364 vsllwil.w.h vr11, vr5, 1 2365 vsllwil.w.h vr12, vr14, 1 2366 vsllwil.w.h vr13, vr15, 1 2367 2368.irp i, vr0, vr1, vr2, 
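// Note (editorial summary of the pattern used by the identity paths below):
// the low half of each 16-bit vector is widened with vsllwil.w.h and the
// high half with vexth.w.h, the scaling is done in 32 bits (identity8 simply
// doubles each coefficient), and the halves are packed back to 16 bits with
// the rounding, saturating vssrarni.h.w.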
function inv_txfm_add_identity_identity_8x8_8bpc_lsx

    vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr14, vr15

    // identity8
    vsllwil.w.h vr6, vr0, 1
    vsllwil.w.h vr7, vr1, 1
    vsllwil.w.h vr8, vr2, 1
    vsllwil.w.h vr9, vr3, 1
    vsllwil.w.h vr10, vr4, 1
    vsllwil.w.h vr11, vr5, 1
    vsllwil.w.h vr12, vr14, 1
    vsllwil.w.h vr13, vr15, 1

.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr14, vr15
    vexth.w.h \i, \i
.endr

.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr14, vr15
    vslli.w \i, \i, 1
.endr

    vssrarni.h.w vr0, vr6, 1 // in0
    vssrarni.h.w vr1, vr7, 1 // in1
    vssrarni.h.w vr2, vr8, 1 // in2
    vssrarni.h.w vr3, vr9, 1 // in3
    vssrarni.h.w vr4, vr10, 1 // in4
    vssrarni.h.w vr5, vr11, 1 // in5
    vssrarni.h.w vr14, vr12, 1 // in6
    vssrarni.h.w vr15, vr13, 1 // in7

    vreplgr2vr.h vr23, zero
.irp i, 0, 16, 32, 48, 64, 80, 96, 112
    vst vr23, a2, \i
.endr

    LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr14, vr15, \
                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23, \
                       vr6, vr7, vr8, vr9, vr10, vr11, vr12, vr13

    vsllwil.w.h vr6, vr16, 1
    vsllwil.w.h vr7, vr17, 1
    vsllwil.w.h vr8, vr18, 1
    vsllwil.w.h vr9, vr19, 1
    vsllwil.w.h vr10, vr20, 1
    vsllwil.w.h vr11, vr21, 1
    vsllwil.w.h vr12, vr22, 1
    vsllwil.w.h vr13, vr23, 1

.irp i, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
    vexth.w.h \i, \i
.endr

.irp i, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
    vslli.w \i, \i, 1
.endr

    vssrarni.h.w vr16, vr6, 4 // in0
    vssrarni.h.w vr17, vr7, 4 // in1
    vssrarni.h.w vr18, vr8, 4 // in2
    vssrarni.h.w vr19, vr9, 4 // in3
    vssrarni.h.w vr20, vr10, 4 // in4
    vssrarni.h.w vr21, vr11, 4 // in5
    vssrarni.h.w vr22, vr12, 4 // in6
    vssrarni.h.w vr23, vr13, 4 // in7

    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr16, vr17, vr18, vr19

    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr20, vr21, vr22, vr23

endfunc
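// One 8-point inverse ADST pass over four coefficient vectors. It expects
// the widened inputs in vr18, vr19 and vr6-vr11 and returns the results
// packed in 64-bit halves as out0|out7, out1|out6, out2|out5 and out3|out4;
// out1, out3, out5 and out7 are negated before packing, matching the sign
// pattern of the scalar ADST output stage.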
.macro adst8x8_1d_lsx out0, out1, out2, out3
    la.local t0, iadst8_coeffs

    vldrepl.w vr20, t0, 0 // 4076
    vldrepl.w vr21, t0, 4 // 401
    vldrepl.w vr22, t0, 8 // 3612
    vldrepl.w vr23, t0, 12 // 1931

    // vr13 t0a t1a vr15 t2a t3a
    vmadd_vmsub_vssrarni_hw_12 vr11, vr18, vr9, vr6, vr20, vr21, vr21, vr20, \
                               vr22, vr23, vr23, vr22, vr12, vr13, vr14, vr15
    vldrepl.w vr20, t0, 16 // 2598
    vldrepl.w vr21, t0, 20 // 3166
    vldrepl.w vr22, t0, 24 // 1189
    vldrepl.w vr23, t0, 28 // 3920

    // vr18 t4a t5a vr6 t6a t7a
    vmadd_vmsub_vssrarni_hw_12 vr7, vr8, vr19, vr10, vr20, vr21, vr21, vr20, \
                               vr22, vr23, vr23, vr22, vr11, vr18, vr9, vr6

    vsadd.h vr12, vr13, vr18 // t0 t1
    vsadd.h vr14, vr15, vr6 // t2 t3
    vssub.h vr9, vr13, vr18 // t4 t5
    vssub.h vr18, vr15, vr6 // t6 t7

    la.local t0, idct_coeffs

    vldrepl.w vr20, t0, 8 // 1567
    vldrepl.w vr21, t0, 12 // 3784
    vldrepl.w vr22, t0, 0 // 2896

    vsllwil.w.h vr7, vr9, 0 // t4
    vexth.w.h vr8, vr9 // t5
    vsllwil.w.h vr10, vr18, 0 // t6
    vexth.w.h vr11, vr18 // t7

    // vr13 out0 out7 vr17 out1 out6
    vmadd_vmsub_vssrarni_hw_12 vr7, vr8, vr11, vr10, vr21, vr20, vr20, vr21, \
                               vr20, vr21, vr21, vr20, vr13, vr15, vr18, vr19
    vshuf4i.d vr19, vr19, 0x01

    vsadd.h vr13, vr12, vr14 // out0 out7
    vssub.h vr6, vr12, vr14 // t2 t3
    vsadd.h vr7, vr15, vr19 // out1 out6
    vssub.h vr18, vr15, vr19 // t6 t7

    vexth.w.h vr20, vr13 // out7
    vsllwil.w.h vr21, vr7, 0 // out1
    vneg.w vr20, vr20
    vneg.w vr21, vr21
    vssrarni.h.w vr21, vr20, 0 // out7 out1
    vilvl.d \out0, vr21, vr13 // out0 out7
    vilvh.d \out1, vr7, vr21 // out1 out6

    vsllwil.w.h vr7, vr6, 0 // t2
    vexth.w.h vr8, vr6 // t3
    vsllwil.w.h vr10, vr18, 0 // t6
    vexth.w.h vr11, vr18 // t7

    // vr15 out[3] out[4] vr18 out[2] out[5]
    vmadd_vmsub_vssrarni_hw_12 vr7, vr8, vr10, vr11, vr22, vr22, vr22, vr22, \
                               vr22, vr22, vr22, vr22, vr14, vr15, vr19, vr18

    vexth.w.h vr20, vr18 // out5
    vsllwil.w.h vr21, vr15, 0 // out3
    vneg.w vr20, vr20
    vneg.w vr21, vr21
    vssrarni.h.w vr21, vr20, 0 // out5 out3
    vilvl.d \out2, vr21, vr18 // out2 out5
    vilvh.d \out3, vr15, vr21 // out3 out4
.endm

function inv_txfm_add_adst_dct_8x8_8bpc_lsx
    addi.d sp, sp, -32
    fst.d f24, sp, 0
    fst.d f25, sp, 8
    fst.d f26, sp, 16
    fst.d f27, sp, 24

    vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17

    vsllwil.w.h vr18, vr0, 0
    vsllwil.w.h vr19, vr1, 0
    vsllwil.w.h vr6, vr2, 0
    vsllwil.w.h vr7, vr3, 0
    vsllwil.w.h vr8, vr4, 0
    vsllwil.w.h vr9, vr5, 0
    vsllwil.w.h vr10, vr16, 0
    vsllwil.w.h vr11, vr17, 0
    adst8x8_1d_lsx vr24, vr25, vr26, vr27

    vexth.w.h vr18, vr0
    vexth.w.h vr19, vr1
    vexth.w.h vr6, vr2
    vexth.w.h vr7, vr3
    vexth.w.h vr8, vr4
    vexth.w.h vr9, vr5
    vexth.w.h vr10, vr16
    vexth.w.h vr11, vr17
    adst8x8_1d_lsx vr0, vr1, vr2, vr3

    vreplgr2vr.h vr23, zero
.irp i, 0, 16, 32, 48, 64, 80, 96, 112
    vst vr23, a2, \i
.endr

.irp i, vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3
    vsrari.h \i, \i, 1
.endr

    LSX_TRANSPOSE8x8_H vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3, \
                       vr4, vr5, vr12, vr13, vr14, vr15, vr24, vr25, \
                       vr6, vr7, vr8, vr9, vr10, vr11, vr16, vr17

    vshuf4i.h vr14, vr14, 0x1b
    vshuf4i.h vr15, vr15, 0x1b
    vshuf4i.h vr24, vr24, 0x1b
    vshuf4i.h vr25, vr25, 0x1b

    vsllwil.w.h vr18, vr4, 0
    vsllwil.w.h vr19, vr5, 0
    vsllwil.w.h vr6, vr12, 0
    vsllwil.w.h vr7, vr13, 0
    vexth.w.h vr8, vr4
    vexth.w.h vr9, vr5
    vexth.w.h vr10, vr12
    vexth.w.h vr11, vr13

    la.local t0, idct_coeffs
    vldrepl.w vr20, t0, 8 // 1567
    vldrepl.w vr21, t0, 12 // 3784
    vldrepl.w vr22, t0, 0 // 2896

    dct_8x4_core_lsx1 vr4, vr5, vr12, vr13

    vshuf4i.d vr5, vr5, 0x01
    vshuf4i.d vr13, vr13, 0x01

    vsllwil.w.h vr18, vr14, 0
    vsllwil.w.h vr19, vr15, 0
    vsllwil.w.h vr6, vr24, 0
    vsllwil.w.h vr7, vr25, 0
    vexth.w.h vr8, vr14
    vexth.w.h vr9, vr15
    vexth.w.h vr10, vr24
    vexth.w.h vr11, vr25

    la.local t0, idct_coeffs
    vldrepl.w vr20, t0, 8 // 1567
    vldrepl.w vr21, t0, 12 // 3784
    vldrepl.w vr22, t0, 0 // 2896

    dct_8x4_core_lsx1 vr14, vr15, vr24, vr25

    vshuf4i.d vr15, vr15, 0x01
    vshuf4i.d vr25, vr25, 0x01

    vilvl.d vr20, vr14, vr4
    vilvh.d vr21, vr14, vr4
    vilvl.d vr22, vr15, vr5
    vilvh.d vr23, vr15, vr5
    vilvl.d vr16, vr24, vr12
    vilvh.d vr17, vr24, vr12
    vilvl.d vr18, vr25, vr13
    vilvh.d vr19, vr25, vr13

.irp i, vr20, vr21, vr22, vr23, vr16, vr17, vr18, vr19
    vsrari.h \i, \i, 4
.endr

    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr20, vr21, vr22, vr23

    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr16, vr17, vr18, vr19

    fld.d f24, sp, 0
    fld.d f25, sp, 8
    fld.d f26, sp, 16
    fld.d f27, sp, 24
    addi.d sp, sp, 32
endfunc

function inv_txfm_add_dct_adst_8x8_8bpc_lsx
    addi.d sp, sp, -48
    fst.d f24, sp, 0
    fst.d f25, sp, 8
    fst.d f26, sp, 16
    fst.d f27, sp, 24
    fst.d f28, sp, 32
    fst.d f29, sp, 40

    vld_x8 a2, 0, 16, vr4, vr5, vr12, vr13, vr14, vr15, vr24, vr25

    la.local t0, idct_coeffs
    vldrepl.w vr20, t0, 8 // 1567
    vldrepl.w vr21, t0, 12 // 3784
    vldrepl.w vr22, t0, 0 // 2896

    vsllwil.w.h vr18, vr4, 0
    vsllwil.w.h vr19, vr5, 0
    vsllwil.w.h vr6, vr12, 0
    vsllwil.w.h vr7, vr13, 0
    vsllwil.w.h vr8, vr14, 0
    vsllwil.w.h vr9, vr15, 0
    vsllwil.w.h vr10, vr24, 0
    vsllwil.w.h vr11, vr25, 0

    dct_8x4_core_lsx1 vr26, vr27, vr28, vr29

    vshuf4i.d vr27, vr27, 0x01
    vshuf4i.d vr29, vr29, 0x01

    vilvl.h vr8, vr27, vr26 // 0 2 4 6 8 10 12 14
    vilvh.h vr9, vr27, vr26 // 1 3 5 7 9 11 13 15
    vilvl.h vr26, vr9, vr8 // 0 - 7 in0
    vilvh.h vr27, vr9, vr8 // 8 - 15 in1
    vilvl.h vr8, vr29, vr28 // 0 2 4 6 8 10 12 14
    vilvh.h vr9, vr29, vr28 // 1 3 5 7 9 11 13 15
    vilvl.h vr28, vr9, vr8 // 16 - 23 in2
    vilvh.h vr29, vr9, vr8 // 24 - 31 in3

    vsrari.h vr26, vr26, 1 // in0low in1low
    vsrari.h vr27, vr27, 1 // in2low in3low
    vsrari.h vr28, vr28, 1 // in0high in1high
    vsrari.h vr29, vr29, 1 // in2high in3high

    vexth.w.h vr18, vr4
    vexth.w.h vr19, vr5
    vexth.w.h vr6, vr12
    vexth.w.h vr7, vr13
    vexth.w.h vr8, vr14
    vexth.w.h vr9, vr15
    vexth.w.h vr10, vr24
    vexth.w.h vr11, vr25

    la.local t0, idct_coeffs
    vldrepl.w vr20, t0, 8 // 1567
    vldrepl.w vr21, t0, 12 // 3784
    vldrepl.w vr22, t0, 0 // 2896

    dct_8x4_core_lsx1 vr12, vr13, vr14, vr15

    vshuf4i.d vr13, vr13, 0x01
    vshuf4i.d vr15, vr15, 0x01

    vilvl.h vr8, vr13, vr12 // 0 2 4 6 8 10 12 14
    vilvh.h vr9, vr13, vr12 // 1 3 5 7 9 11 13 15
    vilvl.h vr12, vr9, vr8 // 0 - 7 in0
    vilvh.h vr13, vr9, vr8 // 8 - 15 in1
    vilvl.h vr8, vr15, vr14 // 0 2 4 6 8 10 12 14
    vilvh.h vr9, vr15, vr14 // 1 3 5 7 9 11 13 15
    vilvl.h vr14, vr9, vr8 // 16 - 23 in2
    vilvh.h vr15, vr9, vr8 // 24 - 31 in3

    vsrari.h vr0, vr12, 1 // in4low in5low
    vsrari.h vr1, vr13, 1 // in6low in7low
    vsrari.h vr2, vr14, 1 // in4high in5high
    vsrari.h vr3, vr15, 1 // in6high in7high

    vreplgr2vr.h vr23, zero
.irp i, 0, 16, 32, 48, 64, 80, 96, 112
    vst vr23, a2, \i
.endr

    vsllwil.w.h vr18, vr26, 0 // in0
    vexth.w.h vr19, vr26 // in1
    vsllwil.w.h vr6, vr27, 0 // in2
    vexth.w.h vr7, vr27 // in3
    vsllwil.w.h vr8, vr0, 0 // in4
    vexth.w.h vr9, vr0 // in5
    vsllwil.w.h vr10, vr1, 0 // in6
    vexth.w.h vr11, vr1 // in7
    adst8x8_1d_lsx vr26, vr27, vr0, vr1

    vsllwil.w.h vr18, vr28, 0 // in0
    vexth.w.h vr19, vr28 // in1
    vsllwil.w.h vr6, vr29, 0 // in2
    vexth.w.h vr7, vr29 // in3
    vsllwil.w.h vr8, vr2, 0 // in4
    vexth.w.h vr9, vr2 // in5
    vsllwil.w.h vr10, vr3, 0 // in6
    vexth.w.h vr11, vr3 // in7
    adst8x8_1d_lsx vr28, vr29, vr16, vr17

    vilvl.d vr4, vr28, vr26 // 0 ... 7
    vilvl.d vr5, vr29, vr27 // 8 ... 15
    vilvl.d vr6, vr16, vr0 // 16 ... 23
    vilvl.d vr7, vr17, vr1 // 24 ... 31
    vilvh.d vr14, vr17, vr1 // 32 ... 39
    vilvh.d vr15, vr16, vr0 // 40 ... 47
    vilvh.d vr16, vr29, vr27 // 48 ... 55
    vilvh.d vr17, vr28, vr26 // 56 ... 63

.irp i, vr4, vr5, vr6, vr7, vr14, vr15, vr16, vr17
    vsrari.h \i, \i, 4
.endr

    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr4, vr5, vr6, vr7

    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr14, vr15, vr16, vr17

    fld.d f24, sp, 0
    fld.d f25, sp, 8
    fld.d f26, sp, 16
    fld.d f27, sp, 24
    fld.d f28, sp, 32
    fld.d f29, sp, 40
    addi.d sp, sp, 48
endfunc

function inv_txfm_add_adst_adst_8x8_8bpc_lsx
    addi.d sp, sp, -32
    fst.d f24, sp, 0
    fst.d f25, sp, 8
    fst.d f26, sp, 16
    fst.d f27, sp, 24

    vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17

    vsllwil.w.h vr18, vr0, 0
    vsllwil.w.h vr19, vr1, 0
    vsllwil.w.h vr6, vr2, 0
    vsllwil.w.h vr7, vr3, 0
    vsllwil.w.h vr8, vr4, 0
    vsllwil.w.h vr9, vr5, 0
    vsllwil.w.h vr10, vr16, 0
    vsllwil.w.h vr11, vr17, 0
    adst8x8_1d_lsx vr24, vr25, vr26, vr27

    vexth.w.h vr18, vr0 // in0
    vexth.w.h vr19, vr1 // in1
    vexth.w.h vr6, vr2 // in2
    vexth.w.h vr7, vr3 // in3
    vexth.w.h vr8, vr4 // in4
    vexth.w.h vr9, vr5 // in5
    vexth.w.h vr10, vr16 // in6
    vexth.w.h vr11, vr17 // in7
    adst8x8_1d_lsx vr0, vr1, vr2, vr3

    vreplgr2vr.h vr23, zero
.irp i, 0, 16, 32, 48, 64, 80, 96, 112
    vst vr23, a2, \i
.endr

.irp i, vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3
    vsrari.h \i, \i, 1
.endr

    LSX_TRANSPOSE8x8_H vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3, \
                       vr14, vr15, vr12, vr13, vr4, vr5, vr24, vr25, \
                       vr6, vr7, vr8, vr9, vr10, vr11, vr16, vr17

    vshuf4i.h vr4, vr4, 0x1b
    vshuf4i.h vr5, vr5, 0x1b
    vshuf4i.h vr24, vr24, 0x1b
    vshuf4i.h vr25, vr25, 0x1b

    vsllwil.w.h vr18, vr14, 0
    vsllwil.w.h vr19, vr15, 0
    vsllwil.w.h vr6, vr12, 0
    vsllwil.w.h vr7, vr13, 0
    vexth.w.h vr8, vr14 // in4
    vexth.w.h vr9, vr15 // in5
    vexth.w.h vr10, vr12 // in6
    vexth.w.h vr11, vr13 // in7

    adst8x8_1d_lsx vr26, vr27, vr0, vr1

    vsllwil.w.h vr18, vr4, 0
    vsllwil.w.h vr19, vr5, 0
    vsllwil.w.h vr6, vr24, 0
    vsllwil.w.h vr7, vr25, 0
    vexth.w.h vr8, vr4 // in4
    vexth.w.h vr9, vr5 // in5
    vexth.w.h vr10, vr24 // in6
    vexth.w.h vr11, vr25 // in7

    adst8x8_1d_lsx vr24, vr25, vr16, vr17

    vilvl.d vr4, vr24, vr26 // 0 ... 7
    vilvl.d vr5, vr25, vr27 // 8 ... 15
    vilvl.d vr6, vr16, vr0 // 16 ... 23
    vilvl.d vr7, vr17, vr1 // 24 ... 31
    vilvh.d vr14, vr17, vr1 // 32 ... 39
    vilvh.d vr15, vr16, vr0 // 40 ... 47
    vilvh.d vr16, vr25, vr27 // 48 ... 55
    vilvh.d vr17, vr24, vr26 // 56 ... 63

.irp i, vr4, vr5, vr6, vr7, vr14, vr15, vr16, vr17
    vsrari.h \i, \i, 4
.endr

    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr4, vr5, vr6, vr7

    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr14, vr15, vr16, vr17

    fld.d f24, sp, 0
    fld.d f25, sp, 8
    fld.d f26, sp, 16
    fld.d f27, sp, 24
    addi.d sp, sp, 32
endfunc

function inv_txfm_add_flipadst_adst_8x8_8bpc_lsx
    addi.d sp, sp, -32
    fst.d f24, sp, 0
    fst.d f25, sp, 8
    fst.d f26, sp, 16
    fst.d f27, sp, 24

    vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17

    vsllwil.w.h vr18, vr0, 0
    vsllwil.w.h vr19, vr1, 0
    vsllwil.w.h vr6, vr2, 0
    vsllwil.w.h vr7, vr3, 0
    vsllwil.w.h vr8, vr4, 0
    vsllwil.w.h vr9, vr5, 0
    vsllwil.w.h vr10, vr16, 0
    vsllwil.w.h vr11, vr17, 0
    adst8x8_1d_lsx vr12, vr13, vr14, vr15

    vilvl.h vr20, vr12, vr13
    vilvl.h vr21, vr14, vr15
    vilvl.w vr24, vr20, vr21
    vilvh.w vr25, vr20, vr21
    vilvh.h vr20, vr12, vr13
    vilvh.h vr21, vr14, vr15
    vilvl.w vr26, vr20, vr21
    vilvh.w vr27, vr20, vr21
    vshuf4i.h vr26, vr26, 0x1b
    vshuf4i.h vr27, vr27, 0x1b

    vexth.w.h vr18, vr0
    vexth.w.h vr19, vr1
    vexth.w.h vr6, vr2
    vexth.w.h vr7, vr3
    vexth.w.h vr8, vr4
    vexth.w.h vr9, vr5
    vexth.w.h vr10, vr16
    vexth.w.h vr11, vr17
    adst8x8_1d_lsx vr12, vr13, vr14, vr15

    vilvl.h vr20, vr12, vr13
    vilvl.h vr21, vr14, vr15
    vilvl.w vr0, vr20, vr21
    vilvh.w vr1, vr20, vr21
    vilvh.h vr20, vr12, vr13
    vilvh.h vr21, vr14, vr15
    vilvl.w vr2, vr20, vr21
    vilvh.w vr3, vr20, vr21
    vshuf4i.h vr2, vr2, 0x1b
    vshuf4i.h vr3, vr3, 0x1b

    vreplgr2vr.h vr23, zero
.irp i, 0, 16, 32, 48, 64, 80, 96, 112
    vst vr23, a2, \i
.endr

.irp i, vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3
    vsrari.h \i, \i, 1
.endr

    vsllwil.w.h vr18, vr26, 0 // in0
    vexth.w.h vr19, vr26 // in1
    vsllwil.w.h vr6, vr27, 0 // in2
    vexth.w.h vr7, vr27 // in3
    vsllwil.w.h vr8, vr2, 0 // in4
    vexth.w.h vr9, vr2 // in5
    vsllwil.w.h vr10, vr3, 0 // in6
    vexth.w.h vr11, vr3 // in7
    adst8x8_1d_lsx vr4, vr5, vr16, vr17

    vsllwil.w.h vr18, vr24, 0 // in0
    vexth.w.h vr19, vr24 // in1
    vsllwil.w.h vr6, vr25, 0 // in2
    vexth.w.h vr7, vr25 // in3
    vsllwil.w.h vr8, vr0, 0 // in4
    vexth.w.h vr9, vr0 // in5
    vsllwil.w.h vr10, vr1, 0 // in6
    vexth.w.h vr11, vr1 // in7
    adst8x8_1d_lsx vr0, vr1, vr2, vr3

    vilvl.d vr20, vr0, vr4 // 0 ... 7
    vilvl.d vr21, vr1, vr5 // 8 ... 15
    vilvl.d vr22, vr2, vr16 // 16 ... 23
    vilvl.d vr23, vr3, vr17 // 24 ... 31
    vilvh.d vr14, vr3, vr17 // 32 ... 39
    vilvh.d vr15, vr2, vr16 // 40 ... 47
    vilvh.d vr16, vr1, vr5 // 48 ... 55
    vilvh.d vr17, vr0, vr4 // 56 ... 63

.irp i, vr20, vr21, vr22, vr23, vr14, vr15, vr16, vr17
    vsrari.h \i, \i, 4
.endr

    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr20, vr21, vr22, vr23

    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr14, vr15, vr16, vr17
    fld.d f24, sp, 0
    fld.d f25, sp, 8
    fld.d f26, sp, 16
    fld.d f27, sp, 24
    addi.d sp, sp, 32
endfunc

function inv_txfm_add_adst_flipadst_8x8_8bpc_lsx
    addi.d sp, sp, -32
    fst.d f24, sp, 0
    fst.d f25, sp, 8
    fst.d f26, sp, 16
    fst.d f27, sp, 24

    vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17

    vsllwil.w.h vr18, vr0, 0
    vsllwil.w.h vr19, vr1, 0
    vsllwil.w.h vr6, vr2, 0
    vsllwil.w.h vr7, vr3, 0
    vsllwil.w.h vr8, vr4, 0
    vsllwil.w.h vr9, vr5, 0
    vsllwil.w.h vr10, vr16, 0
    vsllwil.w.h vr11, vr17, 0
    adst8x8_1d_lsx vr24, vr25, vr26, vr27

    vexth.w.h vr18, vr0
    vexth.w.h vr19, vr1
    vexth.w.h vr6, vr2
    vexth.w.h vr7, vr3
    vexth.w.h vr8, vr4
    vexth.w.h vr9, vr5
    vexth.w.h vr10, vr16
    vexth.w.h vr11, vr17
    adst8x8_1d_lsx vr0, vr1, vr2, vr3

    vreplgr2vr.h vr23, zero
.irp i, 0, 16, 32, 48, 64, 80, 96, 112
    vst vr23, a2, \i
.endr

.irp i, vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3
    vsrari.h \i, \i, 1
.endr

    LSX_TRANSPOSE8x8_H vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3, \
                       vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3, \
                       vr6, vr7, vr8, vr9, vr10, vr11, vr16, vr17

    vshuf4i.h vr0, vr0, 0x1b
    vshuf4i.h vr1, vr1, 0x1b
    vshuf4i.h vr2, vr2, 0x1b
    vshuf4i.h vr3, vr3, 0x1b

    vsllwil.w.h vr18, vr0, 0 // in0
    vsllwil.w.h vr19, vr1, 0 // in1
    vsllwil.w.h vr6, vr2, 0 // in2
    vsllwil.w.h vr7, vr3, 0 // in3
    vexth.w.h vr8, vr0 // in4
    vexth.w.h vr9, vr1 // in5
    vexth.w.h vr10, vr2 // in6
    vexth.w.h vr11, vr3 // in7
    adst8x8_1d_lsx vr4, vr5, vr16, vr17

    vsllwil.w.h vr18, vr24, 0 // in0
    vsllwil.w.h vr19, vr25, 0 // in1
    vsllwil.w.h vr6, vr26, 0 // in2
    vsllwil.w.h vr7, vr27, 0 // in3
    vexth.w.h vr8, vr24 // in4
    vexth.w.h vr9, vr25 // in5
    vexth.w.h vr10, vr26 // in6
    vexth.w.h vr11, vr27 // in7
    adst8x8_1d_lsx vr0, vr1, vr2, vr3

    vilvh.d vr20, vr4, vr0
    vilvh.d vr21, vr5, vr1
    vilvh.d vr22, vr16, vr2
    vilvh.d vr23, vr17, vr3
    vilvl.d vr14, vr17, vr3
    vilvl.d vr15, vr16, vr2
    vilvl.d vr18, vr5, vr1
    vilvl.d vr19, vr4, vr0

.irp i, vr20, vr21, vr22, vr23, vr14, vr15, vr18, vr19
    vsrari.h \i, \i, 4
.endr

    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr20, vr21, vr22, vr23

    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr14, vr15, vr18, vr19

    fld.d f24, sp, 0
    fld.d f25, sp, 8
    fld.d f26, sp, 16
    fld.d f27, sp, 24
    addi.d sp, sp, 32
endfunc

function inv_txfm_add_flipadst_dct_8x8_8bpc_lsx
    addi.d sp, sp, -32
    fst.d f24, sp, 0
    fst.d f25, sp, 8
    fst.d f26, sp, 16
    fst.d f27, sp, 24

    vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17

    vsllwil.w.h vr18, vr0, 0
    vsllwil.w.h vr19, vr1, 0
    vsllwil.w.h vr6, vr2, 0
    vsllwil.w.h vr7, vr3, 0
    vsllwil.w.h vr8, vr4, 0
    vsllwil.w.h vr9, vr5, 0
    vsllwil.w.h vr10, vr16, 0
    vsllwil.w.h vr11, vr17, 0
    adst8x8_1d_lsx vr12, vr13, vr14, vr15

    vilvl.h vr20, vr12, vr13
    vilvl.h vr21, vr14, vr15
    vilvl.w vr24, vr20, vr21
    vilvh.w vr25, vr20, vr21
    vilvh.h vr20, vr12, vr13
    vilvh.h vr21, vr14, vr15
    vilvl.w vr26, vr20, vr21
    vilvh.w vr27, vr20, vr21

    vexth.w.h vr18, vr0
    vexth.w.h vr19, vr1
    vexth.w.h vr6, vr2
    vexth.w.h vr7, vr3
    vexth.w.h vr8, vr4
    vexth.w.h vr9, vr5
    vexth.w.h vr10, vr16
    vexth.w.h vr11, vr17
    adst8x8_1d_lsx vr12, vr13, vr14, vr15

    vilvl.h vr20, vr12, vr13
    vilvl.h vr21, vr14, vr15
    vilvl.w vr0, vr20, vr21
    vilvh.w vr1, vr20, vr21
    vilvh.h vr20, vr12, vr13
    vilvh.h vr21, vr14, vr15
    vilvl.w vr2, vr20, vr21
    vilvh.w vr3, vr20, vr21

    vreplgr2vr.h vr23, zero

.irp i, 0, 16, 32, 48, 64, 80, 96, 112
    vst vr23, a2, \i
.endr

    vsrari.h vr24, vr24, 1
    vsrari.h vr25, vr25, 1
    vsrari.h vr26, vr26, 1
    vsrari.h vr27, vr27, 1
    vsrari.h vr14, vr0, 1
    vsrari.h vr15, vr1, 1
    vsrari.h vr16, vr2, 1
    vsrari.h vr17, vr3, 1

    vsllwil.w.h vr18, vr26, 0
    vexth.w.h vr19, vr26
    vsllwil.w.h vr6, vr27, 0
    vexth.w.h vr7, vr27
    vsllwil.w.h vr8, vr16, 0
    vexth.w.h vr9, vr16
    vsllwil.w.h vr10, vr17, 0
    vexth.w.h vr11, vr17

    la.local t0, idct_coeffs
    vldrepl.w vr20, t0, 8 // 1567
    vldrepl.w vr21, t0, 12 // 3784
    vldrepl.w vr22, t0, 0 // 2896

    dct_8x4_core_lsx1 vr26, vr27, vr16, vr17

    vshuf4i.h vr26, vr26, 0x1b
    vshuf4i.h vr27, vr27, 0x1b
    vshuf4i.h vr16, vr16, 0x1b
    vshuf4i.h vr17, vr17, 0x1b

    vsllwil.w.h vr18, vr24, 0
    vexth.w.h vr19, vr24
    vsllwil.w.h vr6, vr25, 0
    vexth.w.h vr7, vr25
    vsllwil.w.h vr8, vr14, 0
    vexth.w.h vr9, vr14
    vsllwil.w.h vr10, vr15, 0
    vexth.w.h vr11, vr15

    la.local t0, idct_coeffs
    vldrepl.w vr20, t0, 8 // 1567
    vldrepl.w vr21, t0, 12 // 3784
    vldrepl.w vr22, t0, 0 // 2896

    dct_8x4_core_lsx1 vr24, vr25, vr14, vr15

    vilvl.d vr4, vr24, vr26
    vilvh.d vr5, vr24, vr26
    vilvh.d vr6, vr25, vr27
    vilvl.d vr7, vr25, vr27
    vilvl.d vr24, vr14, vr16
    vilvh.d vr25, vr14, vr16
    vilvh.d vr26, vr15, vr17
    vilvl.d vr27, vr15, vr17

.irp i, vr4, vr5, vr6, vr7, vr24, vr25, vr26, vr27
    vsrari.h \i, \i, 4
.endr

    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr4, vr5, vr6, vr7

    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr24, vr25, vr26, vr27

    fld.d f24, sp, 0
    fld.d f25, sp, 8
    fld.d f26, sp, 16
    fld.d f27, sp, 24
    addi.d sp, sp, 32
endfunc

function inv_txfm_add_dct_flipadst_8x8_8bpc_lsx
    addi.d sp, sp, -48
    fst.d f24, sp, 0
    fst.d f25, sp, 8
    fst.d f26, sp, 16
    fst.d f27, sp, 24
    fst.d f28, sp, 32
    fst.d f29, sp, 40

    vld_x8 a2, 0, 16, vr4, vr5, vr12, vr13, vr14, vr15, vr24, vr25

    la.local t0, idct_coeffs
    vldrepl.w vr20, t0, 8 // 1567
    vldrepl.w vr21, t0, 12 // 3784
    vldrepl.w vr22, t0, 0 // 2896

    vsllwil.w.h vr18, vr4, 0
    vsllwil.w.h vr19, vr5, 0
    vsllwil.w.h vr6, vr12, 0
    vsllwil.w.h vr7, vr13, 0
    vsllwil.w.h vr8, vr14, 0
    vsllwil.w.h vr9, vr15, 0
    vsllwil.w.h vr10, vr24, 0
    vsllwil.w.h vr11, vr25, 0
    dct_8x4_core_lsx1 vr26, vr27, vr28, vr29
    vshuf4i.d vr27, vr27, 0x01
    vshuf4i.d vr29, vr29, 0x01

    vilvl.h vr8, vr27, vr26
    vilvh.h vr9, vr27, vr26
    vilvl.h vr26, vr9, vr8
    vilvh.h vr27, vr9, vr8
    vilvl.h vr8, vr29, vr28
    vilvh.h vr9, vr29, vr28
    vilvl.h vr28, vr9, vr8
    vilvh.h vr29, vr9, vr8

    vsrari.h vr26, vr26, 1 // in0low in1low
    vsrari.h vr27, vr27, 1 // in2low in3low
    vsrari.h vr28, vr28, 1 // in0high in1high
    vsrari.h vr29, vr29, 1 // in2high in3high

    vexth.w.h vr18, vr4
    vexth.w.h vr19, vr5
    vexth.w.h vr6, vr12
    vexth.w.h vr7, vr13
    vexth.w.h vr8, vr14
    vexth.w.h vr9, vr15
    vexth.w.h vr10, vr24
    vexth.w.h vr11, vr25
    la.local t0, idct_coeffs
    vldrepl.w vr20, t0, 8 // 1567
    vldrepl.w vr21, t0, 12 // 3784
    vldrepl.w vr22, t0, 0 // 2896
    dct_8x4_core_lsx1 vr12, vr13, vr14, vr15
    vshuf4i.d vr13, vr13, 0x01
    vshuf4i.d vr15, vr15, 0x01

    vilvl.h vr8, vr13, vr12
    vilvh.h vr9, vr13, vr12
    vilvl.h vr12, vr9, vr8
    vilvh.h vr13, vr9, vr8
    vilvl.h vr8, vr15, vr14
    vilvh.h vr9, vr15, vr14
    vilvl.h vr14, vr9, vr8
    vilvh.h vr15, vr9, vr8

    vsrari.h vr0, vr12, 1
    vsrari.h vr1, vr13, 1
    vsrari.h vr2, vr14, 1
    vsrari.h vr3, vr15, 1

    vreplgr2vr.h vr23, zero
.irp i, 0, 16, 32, 48, 64, 80, 96, 112
    vst vr23, a2, \i
.endr

    vsllwil.w.h vr18, vr28, 0 // in0
    vexth.w.h vr19, vr28 // in1
    vsllwil.w.h vr6, vr29, 0 // in2
    vexth.w.h vr7, vr29 // in3
    vsllwil.w.h vr8, vr2, 0 // in4
    vexth.w.h vr9, vr2 // in5
    vsllwil.w.h vr10, vr3, 0 // in6
    vexth.w.h vr11, vr3 // in7
    adst8x8_1d_lsx vr4, vr5, vr16, vr17

    vsllwil.w.h vr18, vr26, 0 // in0
    vexth.w.h vr19, vr26 // in1
    vsllwil.w.h vr6, vr27, 0 // in2
    vexth.w.h vr7, vr27 // in3
    vsllwil.w.h vr8, vr0, 0 // in4
    vexth.w.h vr9, vr0 // in5
    vsllwil.w.h vr10, vr1, 0 // in6
    vexth.w.h vr11, vr1 // in7
    adst8x8_1d_lsx vr0, vr1, vr2, vr3

    vilvh.d vr26, vr4, vr0
    vilvh.d vr27, vr5, vr1
    vilvh.d vr28, vr16, vr2
    vilvh.d vr29, vr17, vr3
    vilvl.d vr20, vr17, vr3
    vilvl.d vr21, vr16, vr2
    vilvl.d vr22, vr5, vr1
    vilvl.d vr23, vr4, vr0

.irp i, vr26, vr27, vr28, vr29, vr20, vr21, vr22, vr23
    vsrari.h \i, \i, 4
.endr

    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr26, vr27, vr28, vr29

    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr20, vr21, vr22, vr23

    fld.d f24, sp, 0
    fld.d f25, sp, 8
    fld.d f26, sp, 16
    fld.d f27, sp, 24
    fld.d f28, sp, 32
    fld.d f29, sp, 40
    addi.d sp, sp, 48
endfunc

function inv_txfm_add_flipadst_flipadst_8x8_8bpc_lsx
    addi.d sp, sp, -32
    fst.d f24, sp, 0
    fst.d f25, sp, 8
    fst.d f26, sp, 16
    fst.d f27, sp, 24

    vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17

    vsllwil.w.h vr18, vr0, 0
    vsllwil.w.h vr19, vr1, 0
    vsllwil.w.h vr6, vr2, 0
    vsllwil.w.h vr7, vr3, 0
    vsllwil.w.h vr8, vr4, 0
    vsllwil.w.h vr9, vr5, 0
    vsllwil.w.h vr10, vr16, 0
    vsllwil.w.h vr11, vr17, 0
    adst8x8_1d_lsx vr12, vr13, vr14, vr15

    vilvl.h vr20, vr12, vr13
    vilvl.h vr21, vr14, vr15
    vilvl.w vr24, vr20, vr21
    vilvh.w vr25, vr20, vr21
    vilvh.h vr20, vr12, vr13
    vilvh.h vr21, vr14, vr15
    vilvl.w vr26, vr20, vr21
    vilvh.w vr27, vr20, vr21
    vshuf4i.h vr26, vr26, 0x1b
    vshuf4i.h vr27, vr27, 0x1b

    vexth.w.h vr18, vr0
    vexth.w.h vr19, vr1
    vexth.w.h vr6, vr2
    vexth.w.h vr7, vr3
    vexth.w.h vr8, vr4
    vexth.w.h vr9, vr5
    vexth.w.h vr10, vr16
    vexth.w.h vr11, vr17
    adst8x8_1d_lsx vr12, vr13, vr14, vr15

    vilvl.h vr20, vr12, vr13
    vilvl.h vr21, vr14, vr15
    vilvl.w vr0, vr20, vr21
    vilvh.w vr1, vr20, vr21
    vilvh.h vr20, vr12, vr13
    vilvh.h vr21, vr14, vr15
    vilvl.w vr2, vr20, vr21
    vilvh.w vr3, vr20, vr21
    vshuf4i.h vr2, vr2, 0x1b
    vshuf4i.h vr3, vr3, 0x1b

.irp i, vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3
    vsrari.h \i, \i, 1
.endr

    vreplgr2vr.h vr23, zero
.irp i, 0, 16, 32, 48, 64, 80, 96, 112
    vst vr23, a2, \i
.endr

    vsllwil.w.h vr18, vr26, 0 // in0
    vexth.w.h vr19, vr26 // in1
    vsllwil.w.h vr6, vr27, 0 // in2
    vexth.w.h vr7, vr27 // in3
    vsllwil.w.h vr8, vr2, 0 // in4
    vexth.w.h vr9, vr2 // in5
    vsllwil.w.h vr10, vr3, 0 // in6
    vexth.w.h vr11, vr3 // in7
    adst8x8_1d_lsx vr4, vr5, vr16, vr17

    vsllwil.w.h vr18, vr24, 0 // in0
    vexth.w.h vr19, vr24 // in1
    vsllwil.w.h vr6, vr25, 0 // in2
    vexth.w.h vr7, vr25 // in3
    vsllwil.w.h vr8, vr0, 0 // in4
    vexth.w.h vr9, vr0 // in5
    vsllwil.w.h vr10, vr1, 0 // in6
    vexth.w.h vr11, vr1 // in7
    adst8x8_1d_lsx vr0, vr1, vr2, vr3

    vilvh.d vr24, vr0, vr4
    vilvh.d vr25, vr1, vr5
    vilvh.d vr26, vr2, vr16
    vilvh.d vr27, vr3, vr17
    vilvl.d vr20, vr3, vr17
    vilvl.d vr21, vr2, vr16
    vilvl.d vr22, vr1, vr5
    vilvl.d vr23, vr0, vr4

.irp i, vr24, vr25, vr26, vr27, vr20, vr21, vr22, vr23
    vsrari.h \i, \i, 4
.endr

    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr24, vr25, vr26, vr27

    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr20, vr21, vr22, vr23

    fld.d f24, sp, 0
    fld.d f25, sp, 8
    fld.d f26, sp, 16
    fld.d f27, sp, 24
    addi.d sp, sp, 32
endfunc

function inv_txfm_add_dct_identity_8x8_8bpc_lsx
    addi.d sp, sp, -48
    fst.d f24, sp, 0
    fst.d f25, sp, 8
    fst.d f26, sp, 16
    fst.d f27, sp, 24
    fst.d f28, sp, 32
    fst.d f29, sp, 40

    vld_x8 a2, 0, 16, vr4, vr5, vr12, vr13, vr14, vr15, vr24, vr25

    la.local t0, idct_coeffs
    vldrepl.w vr20, t0, 8 // 1567
    vldrepl.w vr21, t0, 12 // 3784
    vldrepl.w vr22, t0, 0 // 2896

    vsllwil.w.h vr18, vr4, 0
    vsllwil.w.h vr19, vr5, 0
    vsllwil.w.h vr6, vr12, 0
    vsllwil.w.h vr7, vr13, 0
    vsllwil.w.h vr8, vr14, 0
    vsllwil.w.h vr9, vr15, 0
    vsllwil.w.h vr10, vr24, 0
    vsllwil.w.h vr11, vr25, 0
    dct_8x4_core_lsx1 vr26, vr27, vr28, vr29
    vshuf4i.d vr27, vr27, 0x01
    vshuf4i.d vr29, vr29, 0x01

    vilvl.h vr8, vr27, vr26
    vilvh.h vr9, vr27, vr26
    vilvl.h vr26, vr9, vr8
    vilvh.h vr27, vr9, vr8
    vilvl.h vr8, vr29, vr28
    vilvh.h vr9, vr29, vr28
    vilvl.h vr28, vr9, vr8
    vilvh.h vr29, vr9, vr8

    vsrari.h vr26, vr26, 1 // in0low in1low
    vsrari.h vr27, vr27, 1 // in2low in3low
    vsrari.h vr28, vr28, 1 // in0high in1high
    vsrari.h vr29, vr29, 1 // in2high in3high

    vexth.w.h vr18, vr4
    vexth.w.h vr19, vr5
    vexth.w.h vr6, vr12
    vexth.w.h vr7, vr13
    vexth.w.h vr8, vr14
    vexth.w.h vr9, vr15
    vexth.w.h vr10, vr24
    vexth.w.h vr11, vr25

    la.local t0, idct_coeffs
    vldrepl.w vr20, t0, 8 // 1567
    vldrepl.w vr21, t0, 12 // 3784
    vldrepl.w vr22, t0, 0 // 2896

    dct_8x4_core_lsx1 vr12, vr13, vr14, vr15

    vshuf4i.d vr13, vr13, 0x01
    vshuf4i.d vr15, vr15, 0x01

    vilvl.h vr8, vr13, vr12
    vilvh.h vr9, vr13, vr12
    vilvl.h vr12, vr9, vr8
    vilvh.h vr13, vr9, vr8
    vilvl.h vr8, vr15, vr14
    vilvh.h vr9, vr15, vr14
    vilvl.h vr14, vr9, vr8
    vilvh.h vr15, vr9, vr8

    vsrari.h vr20, vr12, 1
    vsrari.h vr21, vr13, 1
    vsrari.h vr22, vr14, 1
    vsrari.h vr23, vr15, 1

    vreplgr2vr.h vr19, zero
.irp i, 0, 16, 32, 48, 64, 80, 96, 112
    vst vr19, a2, \i
.endr
    // identity8
    vsllwil.w.h vr10, vr26, 1
    vsllwil.w.h vr11, vr27, 1
    vsllwil.w.h vr16, vr28, 1
    vsllwil.w.h vr17, vr29, 1
    vsllwil.w.h vr6, vr20, 1
    vsllwil.w.h vr7, vr21, 1
    vsllwil.w.h vr18, vr22, 1
    vsllwil.w.h vr19, vr23, 1

.irp i, vr26, vr27, vr28, vr29, vr20, vr21, vr22, vr23
    vexth.w.h \i, \i
.endr

.irp i, vr26, vr27, vr28, vr29, vr20, vr21, vr22, vr23
    vslli.w \i, \i, 1
.endr

    vssrarni.h.w vr16, vr10, 4 // in0
    vssrarni.h.w vr28, vr26, 4 // in1
    vssrarni.h.w vr17, vr11, 4 // in2
    vssrarni.h.w vr29, vr27, 4 // in3
    vssrarni.h.w vr18, vr6, 4 // in4
    vssrarni.h.w vr22, vr20, 4 // in5
    vssrarni.h.w vr19, vr7, 4 // in6
    vssrarni.h.w vr23, vr21, 4 // in7

    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr16, vr28, vr17, vr29

    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr18, vr22, vr19, vr23

    fld.d f24, sp, 0
    fld.d f25, sp, 8
    fld.d f26, sp, 16
    fld.d f27, sp, 24
    fld.d f28, sp, 32
    fld.d f29, sp, 40
    addi.d sp, sp, 48
endfunc

function inv_txfm_add_identity_dct_8x8_8bpc_lsx
    addi.d sp, sp, -48
    fst.d f24, sp, 0
    fst.d f25, sp, 8
    fst.d f26, sp, 16
    fst.d f27, sp, 24
    fst.d f28, sp, 32
    fst.d f29, sp, 40

    vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25

    // identity8
    vsllwil.w.h vr6, vr0, 1
    vsllwil.w.h vr7, vr1, 1
    vsllwil.w.h vr8, vr2, 1
    vsllwil.w.h vr9, vr3, 1
    vsllwil.w.h vr10, vr4, 1
    vsllwil.w.h vr11, vr5, 1
    vsllwil.w.h vr12, vr24, 1
    vsllwil.w.h vr13, vr25, 1

.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25
    vexth.w.h \i, \i
.endr

.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25
    vslli.w \i, \i, 1
.endr
    vssrarni.h.w vr0, vr6, 1 // in0
    vssrarni.h.w vr1, vr7, 1 // in1
    vssrarni.h.w vr2, vr8, 1 // in2
    vssrarni.h.w vr3, vr9, 1 // in3
    vssrarni.h.w vr4, vr10, 1 // in4
    vssrarni.h.w vr5, vr11, 1 // in5
    vssrarni.h.w vr24, vr12, 1 // in6
    vssrarni.h.w vr25, vr13, 1 // in7

    vreplgr2vr.h vr23, zero
.irp i, 0, 16, 32, 48, 64, 80, 96, 112
    vst vr23, a2, \i
.endr

    LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25, \
                       vr4, vr5, vr12, vr13, vr14, vr15, vr24, vr25, \
                       vr6, vr7, vr8, vr9, vr10, vr11, vr16, vr17

    la.local t0, idct_coeffs
    vldrepl.w vr20, t0, 8 // 1567
    vldrepl.w vr21, t0, 12 // 3784
    vldrepl.w vr22, t0, 0 // 2896

    // dct4 in0 in2 in4 in6
    vsllwil.w.h vr18, vr4, 0
    vsllwil.w.h vr19, vr5, 0
    vsllwil.w.h vr6, vr12, 0
    vsllwil.w.h vr7, vr13, 0
    vsllwil.w.h vr8, vr14, 0
    vsllwil.w.h vr9, vr15, 0
    vsllwil.w.h vr10, vr24, 0
    vsllwil.w.h vr11, vr25, 0
    dct_8x4_core_lsx1 vr16, vr17, vr26, vr27

    vexth.w.h vr18, vr4
    vexth.w.h vr19, vr5
    vexth.w.h vr6, vr12
    vexth.w.h vr7, vr13
    vexth.w.h vr8, vr14
    vexth.w.h vr9, vr15
    vexth.w.h vr10, vr24
    vexth.w.h vr11, vr25

    la.local t0, idct_coeffs
    vldrepl.w vr20, t0, 8 // 1567
    vldrepl.w vr21, t0, 12 // 3784
    vldrepl.w vr22, t0, 0 // 2896
    dct_8x4_core_lsx1 vr4, vr5, vr24, vr25

    vilvl.d vr8, vr4, vr16
    vilvh.d vr9, vr4, vr16
    vilvh.d vr6, vr5, vr17
    vilvl.d vr7, vr5, vr17
    vilvl.d vr16, vr24, vr26
    vilvh.d vr17, vr24, vr26
    vilvh.d vr18, vr25, vr27
    vilvl.d vr19, vr25, vr27

.irp i, vr8, vr9, vr6, vr7, vr16, vr17, vr18, vr19
    vsrari.h \i, \i, 4
.endr

    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr8, vr9, vr6, vr7

    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr16, vr17, vr18, vr19

    fld.d f24, sp, 0
    fld.d f25, sp, 8
    fld.d f26, sp, 16
    fld.d f27, sp, 24
    fld.d f28, sp, 32
    fld.d f29, sp, 40
    addi.d sp, sp, 48
endfunc

function inv_txfm_add_flipadst_identity_8x8_8bpc_lsx
    addi.d sp, sp, -32
    fst.d f24, sp, 0
    fst.d f25, sp, 8
    fst.d f26, sp, 16
    fst.d f27, sp, 24

    vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17

    vsllwil.w.h vr18, vr0, 0
    vsllwil.w.h vr19, vr1, 0
    vsllwil.w.h vr6, vr2, 0
    vsllwil.w.h vr7, vr3, 0
    vsllwil.w.h vr8, vr4, 0
    vsllwil.w.h vr9, vr5, 0
    vsllwil.w.h vr10, vr16, 0
    vsllwil.w.h vr11, vr17, 0
    adst8x8_1d_lsx vr12, vr13, vr14, vr15

    vilvl.h vr20, vr12, vr13
    vilvl.h vr21, vr14, vr15
    vilvl.w vr24, vr20, vr21
    vilvh.w vr25, vr20, vr21
    vilvh.h vr20, vr12, vr13
    vilvh.h vr21, vr14, vr15
    vilvl.w vr26, vr20, vr21
    vilvh.w vr27, vr20, vr21
    vshuf4i.h vr26, vr26, 0x1b
    vshuf4i.h vr27, vr27, 0x1b

    vexth.w.h vr18, vr0 // in0
    vexth.w.h vr19, vr1 // in1
    vexth.w.h vr6, vr2 // in2
    vexth.w.h vr7, vr3 // in3
    vexth.w.h vr8, vr4 // in4
    vexth.w.h vr9, vr5 // in5
    vexth.w.h vr10, vr16 // in6
    vexth.w.h vr11, vr17 // in7
    adst8x8_1d_lsx vr12, vr13, vr14, vr15

    vilvl.h vr20, vr12, vr13
    vilvl.h vr21, vr14, vr15
    vilvl.w vr16, vr20, vr21
    vilvh.w vr17, vr20, vr21
    vilvh.h vr20, vr12, vr13
    vilvh.h vr21, vr14, vr15
    vilvl.w vr18, vr20, vr21
    vilvh.w vr19, vr20, vr21
    vshuf4i.h vr18, vr18, 0x1b
    vshuf4i.h vr19, vr19, 0x1b

    vreplgr2vr.h vr23, zero
.irp i, 0, 16, 32, 48, 64, 80, 96, 112
    vst vr23, a2, \i
.endr

.irp i, vr24, vr25, vr26, vr27, vr16, vr17, vr18, vr19
    vsrari.h \i, \i, 1
.endr

    // identity8
    vsllwil.w.h vr20, vr24, 1
    vsllwil.w.h vr21, vr25, 1
    vsllwil.w.h vr12, vr26, 1
    vsllwil.w.h vr13, vr27, 1
    vsllwil.w.h vr22, vr16, 1
    vsllwil.w.h vr23, vr17, 1
    vsllwil.w.h vr14, vr18, 1
    vsllwil.w.h vr15, vr19, 1

.irp i, vr24, vr25, vr26, vr27, vr16, vr17, vr18, vr19
    vexth.w.h \i, \i
.endr

.irp i, vr24, vr25, vr26, vr27, vr16, vr17, vr18, vr19
    vslli.w \i, \i, 1
.endr

    vssrarni.h.w vr20, vr12, 4 // in0
    vssrarni.h.w vr24, vr26, 4 // in1
    vssrarni.h.w vr21, vr13, 4 // in2
    vssrarni.h.w vr25, vr27, 4 // in3
    vssrarni.h.w vr22, vr14, 4 // in4
    vssrarni.h.w vr16, vr18, 4 // in5
    vssrarni.h.w vr23, vr15, 4 // in6
    vssrarni.h.w vr17, vr19, 4 // in7

    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr20, vr24, vr21, vr25

    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr22, vr16, vr23, vr17

    fld.d f24, sp, 0
    fld.d f25, sp, 8
    fld.d f26, sp, 16
    fld.d f27, sp, 24
    addi.d sp, sp, 32
endfunc

function inv_txfm_add_identity_flipadst_8x8_8bpc_lsx
    addi.d sp, sp, -48
    fst.d f24, sp, 0
    fst.d f25, sp, 8
    fst.d f26, sp, 16
    fst.d f27, sp, 24
    fst.d f28, sp, 32
    fst.d f29, sp, 40

    vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25

    // identity8
    vsllwil.w.h vr6, vr0, 1
    vsllwil.w.h vr7, vr1, 1
    vsllwil.w.h vr8, vr2, 1
    vsllwil.w.h vr9, vr3, 1
    vsllwil.w.h vr10, vr4, 1
    vsllwil.w.h vr11, vr5, 1
    vsllwil.w.h vr12, vr24, 1
    vsllwil.w.h vr13, vr25, 1

.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25
    vexth.w.h \i, \i
.endr

.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25
    vslli.w \i, \i, 1
.endr

    vssrarni.h.w vr0, vr6, 1 // in0
    vssrarni.h.w vr1, vr7, 1 // in1
    vssrarni.h.w vr2, vr8, 1 // in2
    vssrarni.h.w vr3, vr9, 1 // in3
    vssrarni.h.w vr4, vr10, 1 // in4
    vssrarni.h.w vr5, vr11, 1 // in5
    vssrarni.h.w vr24, vr12, 1 // in6
    vssrarni.h.w vr25, vr13, 1 // in7

    LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25, \
                       vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25, \
                       vr6, vr7, vr8, vr9, vr10, vr11, vr12, vr13

    vreplgr2vr.h vr23, zero
.irp i, 0, 16, 32, 48, 64, 80, 96, 112
    vst vr23, a2, \i
.endr

    vsllwil.w.h vr18, vr0, 0 // in0
    vsllwil.w.h vr19, vr1, 0 // in1
    vsllwil.w.h vr6, vr2, 0 // in2
    vsllwil.w.h vr7, vr3, 0 // in3
    vsllwil.w.h vr8, vr4, 0 // in4
    vsllwil.w.h vr9, vr5, 0 // in5
    vsllwil.w.h vr10, vr24, 0 // in6
    vsllwil.w.h vr11, vr25, 0 // in7
    adst8x8_1d_lsx vr26, vr27, vr28, vr29

    vexth.w.h vr18, vr0 // in0
    vexth.w.h vr19, vr1 // in1
    vexth.w.h vr6, vr2 // in2
    vexth.w.h vr7, vr3 // in3
    vexth.w.h vr8, vr4 // in4
    vexth.w.h vr9, vr5 // in5
    vexth.w.h vr10, vr24 // in6
    vexth.w.h vr11, vr25 // in7
    adst8x8_1d_lsx vr0, vr1, vr2, vr3

    vilvh.d vr4, vr0, vr26
    vilvh.d vr5, vr1, vr27
    vilvh.d vr6, vr2, vr28
    vilvh.d vr7, vr3, vr29
    vilvl.d vr14, vr3, vr29
    vilvl.d vr15, vr2, vr28
    vilvl.d vr16, vr1, vr27
    vilvl.d vr17, vr0, vr26

.irp i, vr4, vr5, vr6, vr7, vr14, vr15, vr16, vr17
    vsrari.h \i, \i, 4
.endr

    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr4, vr5, vr6, vr7

    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr14, vr15, vr16, vr17

    fld.d f24, sp, 0
    fld.d f25, sp, 8
    fld.d f26, sp, 16
    fld.d f27, sp, 24
    fld.d f28, sp, 32
    fld.d f29, sp, 40
    addi.d sp, sp, 48

endfunc

function inv_txfm_add_adst_identity_8x8_8bpc_lsx
    addi.d sp, sp, -32
    fst.d f24, sp, 0
    fst.d f25, sp, 8
    fst.d f26, sp, 16
    fst.d f27, sp, 24

    vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17

    vsllwil.w.h vr18, vr0, 0
    vsllwil.w.h vr19, vr1, 0
    vsllwil.w.h vr6, vr2, 0
    vsllwil.w.h vr7, vr3, 0
    vsllwil.w.h vr8, vr4, 0
    vsllwil.w.h vr9, vr5, 0
    vsllwil.w.h vr10, vr16, 0
    vsllwil.w.h vr11, vr17, 0
    adst8x8_1d_lsx vr24, vr25, vr26, vr27

    vexth.w.h vr18, vr0
    vexth.w.h vr19, vr1
    vexth.w.h vr6, vr2
    vexth.w.h vr7, vr3
    vexth.w.h vr8, vr4
    vexth.w.h vr9, vr5
    vexth.w.h vr10, vr16
    vexth.w.h vr11, vr17
    adst8x8_1d_lsx vr0, vr1, vr2, vr3

    vreplgr2vr.h vr23, zero
.irp i, 0, 16, 32, 48, 64, 80, 96, 112
    vst vr23, a2, \i
.endr

.irp i, vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3
    vsrari.h \i, \i, 1
.endr

    LSX_TRANSPOSE8x8_H vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3, \
                       vr24, vr25, vr20, vr21, vr26, vr27, vr22, vr23, \
                       vr6, vr7, vr8, vr9, vr10, vr11, vr16, vr17

    vshuf4i.h vr26, vr26, 0x1b
    vshuf4i.h vr27, vr27, 0x1b
    vshuf4i.h vr22, vr22, 0x1b
    vshuf4i.h vr23, vr23, 0x1b

    // identity8
    vsllwil.w.h vr16, vr24, 1
    vsllwil.w.h vr17, vr25, 1
    vsllwil.w.h vr10, vr20, 1
    vsllwil.w.h vr11, vr21, 1
    vsllwil.w.h vr18, vr26, 1
    vsllwil.w.h vr19, vr27, 1
    vsllwil.w.h vr14, vr22, 1
    vsllwil.w.h vr15, vr23, 1

.irp i, vr24, vr25, vr20, vr21, vr26, vr27, vr22, vr23
    vexth.w.h \i, \i
.endr

.irp i, vr24, vr25, vr20, vr21, vr26, vr27, vr22, vr23
    vslli.w \i, \i, 1
.endr

    vssrarni.h.w vr18, vr16, 4 // in0
    vssrarni.h.w vr19, vr17, 4 // in1
    vssrarni.h.w vr14, vr10, 4 // in2
    vssrarni.h.w vr15, vr11, 4 // in3
    vssrarni.h.w vr26, vr24, 4 // in4
    vssrarni.h.w vr27, vr25, 4 // in5
    vssrarni.h.w vr22, vr20, 4 // in6
    vssrarni.h.w vr23, vr21, 4 // in7

    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr18, vr19, vr14, vr15

    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr26, vr27, vr22, vr23

    fld.d f24, sp, 0
    fld.d f25, sp, 8
    fld.d f26, sp, 16
    fld.d f27, sp, 24
    addi.d sp, sp, 32
endfunc

function inv_txfm_add_identity_adst_8x8_8bpc_lsx
    addi.d sp, sp, -48
    fst.d f24, sp, 0
    fst.d f25, sp, 8
    fst.d f26, sp, 16
    fst.d f27, sp, 24
    fst.d f28, sp, 32
    fst.d f29, sp, 40

    vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25

    // identity8
    vsllwil.w.h vr6, vr0, 1
    vsllwil.w.h vr7, vr1, 1
    vsllwil.w.h vr8, vr2, 1
    vsllwil.w.h vr9, vr3, 1
    vsllwil.w.h vr10, vr4, 1
    vsllwil.w.h vr11, vr5, 1
    vsllwil.w.h vr12, vr24, 1
    vsllwil.w.h vr13, vr25, 1

.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25
    vexth.w.h \i, \i
.endr

.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25
    vslli.w \i, \i, 1
.endr

    vssrarni.h.w vr0, vr6, 1 // in0
    vssrarni.h.w vr1, vr7, 1 // in1
    vssrarni.h.w vr2, vr8, 1 // in2
    vssrarni.h.w vr3, vr9, 1 // in3
    vssrarni.h.w vr4, vr10, 1 // in4
    vssrarni.h.w vr5, vr11, 1 // in5
    vssrarni.h.w vr24, vr12, 1 // in6
    vssrarni.h.w vr25, vr13, 1 // in7

    LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25, \
                       vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25, \
                       vr6, vr7, vr8, vr9, vr10, vr11, vr12, vr13

    vreplgr2vr.h vr23, zero

.irp i, 0, 16, 32, 48, 64, 80, 96, 112
    vst vr23, a2, \i
.endr

    vsllwil.w.h vr18, vr0, 0
    vsllwil.w.h vr19, vr1, 0
    vsllwil.w.h vr6, vr2, 0
    vsllwil.w.h vr7, vr3, 0
    vsllwil.w.h vr8, vr4, 0
    vsllwil.w.h vr9, vr5, 0
    vsllwil.w.h vr10, vr24, 0
    vsllwil.w.h vr11, vr25, 0
    adst8x8_1d_lsx vr26, vr27, vr28, vr29

    vexth.w.h vr18, vr0
    vexth.w.h vr19, vr1
    vexth.w.h vr6, vr2
    vexth.w.h vr7, vr3
    vexth.w.h vr8, vr4
    vexth.w.h vr9, vr5
    vexth.w.h vr10, vr24
    vexth.w.h vr11, vr25

    adst8x8_1d_lsx vr0, vr1, vr2, vr3

    vilvl.d vr4, vr0, vr26 // 0 ... 7
    vilvl.d vr5, vr1, vr27 // 8 ... 15
    vilvl.d vr6, vr2, vr28 // 16 ... 23
    vilvl.d vr7, vr3, vr29 // 24 ... 31
    vilvh.d vr14, vr3, vr29 // 32 ... 39
    vilvh.d vr15, vr2, vr28 // 40 ... 47
    vilvh.d vr16, vr1, vr27 // 48 ... 55
    vilvh.d vr17, vr0, vr26 // 56 ... 63

.irp i, vr4, vr5, vr6, vr7, vr14, vr15, vr16, vr17
    vsrari.h \i, \i, 4
.endr

    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr4, vr5, vr6, vr7

    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr14, vr15, vr16, vr17

    fld.d f24, sp, 0
    fld.d f25, sp, 8
    fld.d f26, sp, 16
    fld.d f27, sp, 24
    fld.d f28, sp, 32
    fld.d f29, sp, 40
    addi.d sp, sp, 48
endfunc
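// Widening multiply helpers: both split their 16-bit inputs into 32-bit
// low/high halves and accumulate out = in0 * in2 +/- in1 * in3 at full
// precision; callers round back to 16 bits with vssrarni.h.w, giving the
// usual fixed-point butterfly of the scalar reference code.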
.macro vmul_vmadd_w in0, in1, in2, in3, out0, out1
    vsllwil.w.h vr22, \in0, 0
    vexth.w.h vr23, \in0
    vmul.w \out0, vr22, \in2
    vmul.w \out1, vr23, \in2
    vsllwil.w.h vr22, \in1, 0
    vexth.w.h vr23, \in1
    vmadd.w \out0, vr22, \in3
    vmadd.w \out1, vr23, \in3
.endm

.macro vmul_vmsub_w in0, in1, in2, in3, out0, out1
    vsllwil.w.h vr22, \in0, 0
    vexth.w.h vr23, \in0
    vmul.w \out0, vr22, \in2
    vmul.w \out1, vr23, \in2
    vsllwil.w.h vr22, \in1, 0
    vexth.w.h vr23, \in1
    vmsub.w \out0, vr22, \in3
    vmsub.w \out1, vr23, \in3
.endm

// Prescale one vector by 2896/4096 (~1/sqrt(2)), used by the rectangular
// transforms.
.macro rect2_lsx in0, in1, out0
    vsllwil.w.h vr22, \in0, 0
    vexth.w.h \in0, \in0
    vmul.w vr22, vr22, \in1
    vmul.w \out0, \in0, \in1
    vssrarni.h.w \out0, vr22, 12
.endm

// One 8-point inverse DCT pass; \rect2 selects an optional input prescale.
.macro dct_8x8_core_lsx in0, in1, in2, in3, in4, in5, in6, in7, out0, \
                        out1, out2, out3, out4, out5, out6, out7, rect2

    la.local t0, idct_coeffs

.ifc \rect2, rect2_lsx
    vldrepl.w vr23, t0, 0 // 2896
.irp i, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7
    rect2_lsx \i, vr23, \i
.endr
.endif
    vldrepl.w vr20, t0, 8 // 1567
    vldrepl.w vr21, t0, 12 // 3784

    vmul_vmadd_w \in2, \in6, vr21, vr20, vr8, vr9
    vssrarni.h.w vr9, vr8, 12 // t3
    vmul_vmsub_w \in2, \in6, vr20, vr21, vr8, vr10
    vssrarni.h.w vr10, vr8, 12 // t2

    vldrepl.w vr20, t0, 0 // 2896
    vmul_vmadd_w \in0, \in4, vr20, vr20, vr8, \in2
    vssrarni.h.w \in2, vr8, 12 // t0
    vmul_vmsub_w \in0, \in4, vr20, vr20, vr8, \in6
    vssrarni.h.w \in6, vr8, 12 // t1

    vsadd.h vr8, \in2, vr9 // c[0]
    vssub.h vr9, \in2, vr9 // c[3]
    vsadd.h \in0, \in6, vr10 // c[1]
    vssub.h vr10, \in6, vr10 // c[2]

    vldrepl.w vr20, t0, 16 // 799
    vldrepl.w vr21, t0, 20 // 4017
    vmul_vmadd_w \in1, \in7, vr21, vr20, \in2, \in4
    vssrarni.h.w \in4, \in2, 12 // t7a
    vmul_vmsub_w \in1, \in7, vr20, vr21, \in2, \in6
    vssrarni.h.w \in6, \in2, 12 // t4a

    vldrepl.w vr20, t0, 24 // 3406
    vldrepl.w vr21, t0, 28 // 2276
    vmul_vmadd_w \in5, \in3, vr21, vr20, \in2, \in1
    vssrarni.h.w \in1, \in2, 12 // t6a
    vmul_vmsub_w \in5, \in3, vr20, vr21, \in2, \in7
    vssrarni.h.w \in7, \in2, 12 // t5a

    vsadd.h \in3, \in6, \in7 // t4
    vssub.h \in6, \in6, \in7 // t5a
    vsadd.h \in5, \in4, \in1 // t7
    vssub.h \in4, \in4, \in1 // t6a

    vldrepl.w vr20, t0, 0 // 2896
    vmul_vmadd_w \in4, \in6, vr20, vr20, \in2, \in1
    vssrarni.h.w \in1, \in2, 12 // t6
    vmul_vmsub_w \in4, \in6, vr20, vr20, \in2, \in7
    vssrarni.h.w \in7, \in2, 12 // t5

    vsadd.h \out0, vr8, \in5 // c[0]
    vssub.h \out7, vr8, \in5 // c[7]
    vsadd.h \out1, \in0, \in1 // c[1]
    vssub.h \out6, \in0, \in1 // c[6]
    vsadd.h \out2, vr10, \in7 // c[2]
    vssub.h \out5, vr10, \in7 // c[5]
    vsadd.h \out3, vr9, \in3 // c[3]
    vssub.h \out4, vr9, \in3 // c[4]
.endm
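// dct_dct special-cases an all-but-DC-zero block (a3, the eob, is zero): the
// DC value appears to be scaled by 181/256 (~1/sqrt(2)) once per 1-D pass,
// rounded, and the resulting constant is added to every pixel of the block.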
function inv_txfm_add_dct_dct_8x8_8bpc_lsx
    bnez a3, .NO_HAS_DCONLY_8x8

    ld.h t2, a2, 0 // dc
    vldi vr0, 0x8b5 // 181
    vreplgr2vr.w vr1, t2
    vldi vr5, 0x880 // 128
    vmul.w vr2, vr0, vr1 // dc * 181
    st.h zero, a2, 0
    vsrari.w vr2, vr2, 8 // (dc * 181 + 128) >> 8
    vld vr10, a0, 0 // 0 1 2 3 4 5 6 7
    vsrari.w vr2, vr2, 1 // (dc + rnd) >> shift
    vldx vr11, a0, a1 // 8 9 10 11 12 13 14 15
    alsl.d t2, a1, a0, 1
    vmadd.w vr5, vr2, vr0
    vld vr12, t2, 0 // 16 17 18 19 20 21 22 23
    vssrarni.h.w vr5, vr5, 12
    vldx vr13, t2, a1 // 24 25 26 27 28 29 30 31

    DST_ADD_W8 vr10, vr11, vr12, vr13, vr5, vr5, vr5, vr5

    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr5, vr5, vr5, vr5

    b .DCT_DCT_8X8_END

.NO_HAS_DCONLY_8x8:

    vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

    la.local t0, idct_coeffs

    dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                     vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, no_rect2

    LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
                       vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

.irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18
    vsrari.h \i, \i, 1
.endr

    vreplgr2vr.h vr23, zero

.irp i, 0, 16, 32, 48, 64, 80, 96, 112
    vst vr23, a2, \i
.endr

    dct_8x8_core_lsx vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
                     vr4, vr5, vr6, vr7, vr20, vr21, vr22, vr23, no_rect2

.irp i, vr4, vr5, vr6, vr7, vr20, vr21, vr22, vr23
    vsrari.h \i, \i, 4
.endr

    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr4, vr5, vr6, vr7

    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr20, vr21, vr22, vr23

.DCT_DCT_8X8_END:

endfunc
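// 16-point inverse DCT column pass: the even inputs go through the 8-point
// dct_8x8_core_lsx above, the odd inputs build t8a-t15a from the idct16
// coefficients, and the two halves are merged into c[0]-c[15] with
// saturating adds/subs.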
    // vr22 vr23 vr30 vr31 vr24 vr25 vr26 vr27
    vsadd.h vr28, vr29, vr31   // t8
    vssub.h vr19, vr29, vr31   // t9
    vssub.h vr29, vr27, vr25   // t10
    vsadd.h vr9, vr27, vr25    // t11
    vsadd.h vr31, vr26, vr24   // t12
    vssub.h vr25, vr26, vr24   // t13
    vssub.h vr27, vr10, vr30   // t14
    vsadd.h vr24, vr10, vr30   // t15

    vldrepl.w vr20, t0, 8    // 1567
    vldrepl.w vr21, t0, 12   // 3784
    vmul_vmadd_w vr27, vr19, vr21, vr20, vr0, vr26
    vssrarni.h.w vr26, vr0, 12   // t14a
    vmul_vmsub_w vr27, vr19, vr20, vr21, vr0, vr30
    vssrarni.h.w vr30, vr0, 12   // t9a

    vmul_vmadd_w vr25, vr29, vr21, vr20, vr0, vr19
    vneg.w vr0, vr0
    vneg.w vr19, vr19
    vssrarni.h.w vr19, vr0, 12   // t10a
    vmul_vmsub_w vr25, vr29, vr20, vr21, vr0, vr27
    vssrarni.h.w vr27, vr0, 12   // t13a

    vsadd.h vr25, vr28, vr9    // t8a
    vssub.h vr29, vr28, vr9    // t11a
    vssub.h vr28, vr24, vr31   // t12a
    vsadd.h vr10, vr24, vr31   // t15a
    vsadd.h vr9, vr30, vr19    // t9
    vssub.h vr31, vr30, vr19   // t10
    vssub.h vr30, vr26, vr27   // t13
    vsadd.h vr24, vr26, vr27   // t14

    vldrepl.w vr20, t0, 0   // 2896
    vmul_vmadd_w vr30, vr31, vr20, vr20, vr0, vr26
    vssrarni.h.w vr26, vr0, 12   // t13a
    vmul_vmsub_w vr30, vr31, vr20, vr20, vr0, vr27
    vssrarni.h.w vr27, vr0, 12   // t10a

    vmul_vmadd_w vr28, vr29, vr20, vr20, vr0, vr31
    vssrarni.h.w vr31, vr0, 12   // t12
    vmul_vmsub_w vr28, vr29, vr20, vr20, vr0, vr30
    vssrarni.h.w vr30, vr0, 12   // t11

    // vr11 vr12 ... vr18
    vsadd.h vr28, vr14, vr31   // c[3]
    vssub.h vr29, vr14, vr31   // c[12]
    vsadd.h vr20, vr15, vr30   // c[4]
    vssub.h vr21, vr15, vr30   // c[11]
    vsadd.h vr14, vr16, vr27   // c[5]
    vssub.h vr23, vr16, vr27   // c[10]
    vsadd.h vr15, vr17, vr9    // c[6]
    vssub.h vr30, vr17, vr9    // c[9]
    vsadd.h vr16, vr18, vr25   // c[7]
    vssub.h vr27, vr18, vr25   // c[8]
    vsadd.h vr17, vr13, vr26   // c[2]
    vssub.h vr26, vr13, vr26   // c[13]
    vsadd.h vr18, vr12, vr24   // c[1]
    vssub.h vr25, vr12, vr24   // c[14]
    vsadd.h vr22, vr11, vr10   // c[0]
    vssub.h vr24, vr11, vr10   // c[15]
.endm

function inv_txfm_add_dct_dct_8x16_8bpc_lsx
    bnez a3, .NO_HAS_DCONLY_8x16

    ld.h t2, a2, 0            // dc
    vldi vr0, 0x8b5           // 181
    vreplgr2vr.w vr1, t2
    vldi vr5, 0x880           // 128
    vmul.w vr2, vr0, vr1      // dc * 181
    st.h zero, a2, 0
    vsrari.w vr2, vr2, 8      // (dc * 181 + 128) >> 8
    vld vr10, a0, 0           // 0 1 2 3 4 5 6 7
    vmul.w vr2, vr0, vr2
    vsrari.w vr2, vr2, 8      // (dc * 181 + 128) >> 8
    vsrari.w vr2, vr2, 1      // (dc + rnd) >> shift
    vldx vr11, a0, a1         // 8 9 10 11 12 13 14 15
    alsl.d t2, a1, a0, 1
    vmadd.w vr5, vr2, vr0
    vld vr12, t2, 0           // 16 17 18 19 20 21 22 23
    vssrarni.h.w vr5, vr5, 12
    vldx vr13, t2, a1         // 24 25 26 27 28 29 30 31

    DST_ADD_W8 vr10, vr11, vr12, vr13, vr5, vr5, vr5, vr5

    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr5, vr5, vr5, vr5

    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr5, vr5, vr5, vr5

    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr5, vr5, vr5, vr5

    b .DCT_DCT_8X16_END

.NO_HAS_DCONLY_8x16:
    addi.d sp, sp, -64
    fst.d f24, sp, 0
    fst.d f25, sp, 8
    fst.d f26, sp, 16
    fst.d f27, sp, 24
    fst.d f28, sp, 32
    fst.d f29, sp, 40
    fst.d f30, sp, 48
    fst.d f31, sp, 56

    vld_x8 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

    la.local t0, idct_coeffs

    dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                     vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, rect2_lsx
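    // rect2_lsx pre-multiplies the coefficients by 2896/4096 ~= 1/sqrt(2),
    // the extra normalization AV1 applies to rectangular transforms whose
    // sides differ by a factor of two (8x16 here); the dc-only path above
    // applies the same scale as a second (dc * 181 + 128) >> 8 step.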
    vld_x8 a2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

    dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                     vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, rect2_lsx

.irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
        vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
    vsrari.h \i, \i, 1
.endr

    vreplgr2vr.h vr23, zero

.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240
    vst vr23, a2, \i
.endr

    LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                       vr8, vr9, vr10, vr20, vr21, vr22, vr23, vr31

    LSX_TRANSPOSE8x8_H vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, \
                       vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, \
                       vr8, vr9, vr10, vr20, vr21, vr22, vr23, vr31

    dct_8x16_core_lsx

.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
        vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
    vsrari.h \i, \i, 4
.endr

    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr22, vr18, vr17, vr28

    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr20, vr14, vr15, vr16

    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr27, vr30, vr23, vr21

    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr29, vr26, vr25, vr24

    fld.d f24, sp, 0
    fld.d f25, sp, 8
    fld.d f26, sp, 16
    fld.d f27, sp, 24
    fld.d f28, sp, 32
    fld.d f29, sp, 40
    fld.d f30, sp, 48
    fld.d f31, sp, 56
    addi.d sp, sp, 64
.DCT_DCT_8X16_END:
endfunc

.macro identity_8x8_core_lsx in0, in1, in2, in3, in4, in5, in6, in7, rect2

    la.local t0, idct_coeffs

.ifc \rect2, rect2_lsx
    vldrepl.w vr23, t0, 0   // 2896
.irp i, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7
    rect2_lsx \i, vr23, \i
.endr
.endif
    vsllwil.w.h vr8, \in0, 1
    vsllwil.w.h vr9, \in1, 1
    vsllwil.w.h vr10, \in2, 1
    vsllwil.w.h vr11, \in3, 1
    vsllwil.w.h vr12, \in4, 1
    vsllwil.w.h vr13, \in5, 1
    vsllwil.w.h vr14, \in6, 1
    vsllwil.w.h vr15, \in7, 1

.irp i, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7
    vexth.w.h \i, \i
.endr

.irp i, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7
    vslli.w \i, \i, 1
.endr

    vssrarni.h.w \in0, vr8, 1
    vssrarni.h.w \in1, vr9, 1
    vssrarni.h.w \in2, vr10, 1
    vssrarni.h.w \in3, vr11, 1
    vssrarni.h.w \in4, vr12, 1
    vssrarni.h.w \in5, vr13, 1
    vssrarni.h.w \in6, vr14, 1
    vssrarni.h.w \in7, vr15, 1
.endm

.macro identity_8x16_core_lsx in0, out0
    vsadd.h vr10, \in0, \in0
    vsllwil.w.h vr8, \in0, 0
    vexth.w.h \out0, \in0
    vmul.w vr8, vr8, vr20
    vmul.w \out0, \out0, vr20
    vssrarni.h.w \out0, vr8, 11
    vsadd.h \out0, \out0, vr10
.endm
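// identity_8x8_core_lsx scales each coefficient by exactly 2 (the 8-point
// identity transform). identity_8x16_core_lsx scales by 2*sqrt(2), computed
// as 2*in + ((in * 1697 + 1024) >> 11), since 1697/2048 ~= 2*sqrt(2) - 2.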
function inv_txfm_add_identity_identity_8x16_8bpc_lsx
    addi.d sp, sp, -64
    fst.d f24, sp, 0
    fst.d f25, sp, 8
    fst.d f26, sp, 16
    fst.d f27, sp, 24
    fst.d f28, sp, 32
    fst.d f29, sp, 40
    fst.d f30, sp, 48
    fst.d f31, sp, 56

    vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

    identity_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, rect2_lsx

    vld_x8 a2, 128, 16, vr16, vr17, vr18, vr19, vr24, vr25, vr26, vr27

    identity_8x8_core_lsx vr16, vr17, vr18, vr19, vr24, vr25, vr26, vr27, rect2_lsx

    vreplgr2vr.h vr23, zero

.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240
    vst vr23, a2, \i
.endr

    LSX_TRANSPOSE8x8_H vr0, vr2, vr4, vr6, vr16, vr18, vr24, vr26, \
                       vr14, vr15, vr22, vr23, vr16, vr18, vr24, vr26, \
                       vr8, vr9, vr10, vr11, vr12, vr13, vr20, vr21

    LSX_TRANSPOSE8x8_H vr1, vr3, vr5, vr7, vr17, vr19, vr25, vr27, \
                       vr28, vr29, vr30, vr31, vr17, vr19, vr25, vr27, \
                       vr8, vr9, vr10, vr11, vr12, vr13, vr20, vr21

    li.w t0, 1697
    vreplgr2vr.w vr20, t0

.irp i, vr14, vr15, vr22, vr23, vr16, vr18, vr24, vr26, \
        vr28, vr29, vr30, vr31, vr17, vr19, vr25, vr27
    identity_8x16_core_lsx \i, \i
    vsrari.h \i, \i, 4
.endr

    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr14, vr15, vr22, vr23

    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr16, vr18, vr24, vr26

    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr28, vr29, vr30, vr31

    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr17, vr19, vr25, vr27

    fld.d f24, sp, 0
    fld.d f25, sp, 8
    fld.d f26, sp, 16
    fld.d f27, sp, 24
    fld.d f28, sp, 32
    fld.d f29, sp, 40
    fld.d f30, sp, 48
    fld.d f31, sp, 56
    addi.d sp, sp, 64
endfunc

.macro adst_8x8_core_lsx in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                         out2, out3, out4, out5, out6, out7, rect2

    la.local t0, iadst8_coeffs

.ifc \rect2, rect2_lsx
    vldrepl.w vr23, t0, 32   // 2896
.irp i, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7
    rect2_lsx \i, vr23, \i
.endr
.endif

    vldrepl.w vr20, t0, 0   // 4076
    vldrepl.w vr21, t0, 4   // 401

    vmul_vmadd_w vr7, vr0, vr20, vr21, vr8, vr9
    vssrarni.h.w vr9, vr8, 12    // t0a low
    vmul_vmsub_w vr7, vr0, vr21, vr20, vr8, vr10
    vssrarni.h.w vr10, vr8, 12   // t1a low

    vldrepl.w vr20, t0, 8    // 3612
    vldrepl.w vr21, t0, 12   // 1931
    vmul_vmadd_w vr5, vr2, vr20, vr21, vr8, vr0
    vssrarni.h.w vr0, vr8, 12   // t2a low
    vmul_vmsub_w vr5, vr2, vr21, vr20, vr8, vr7
    vssrarni.h.w vr7, vr8, 12   // t3a low

    vldrepl.w vr20, t0, 16   // 2598 -> 1299
    vldrepl.w vr21, t0, 20   // 3166 -> 1583
    vmul_vmadd_w vr3, vr4, vr20, vr21, vr8, vr2
    vssrarni.h.w vr2, vr8, 12   // t4a low
    vmul_vmsub_w vr3, vr4, vr21, vr20, vr8, vr5
    vssrarni.h.w vr5, vr8, 12   // t5a low

    vldrepl.w vr20, t0, 24   // 1189
    vldrepl.w vr21, t0, 28   // 3920
    vmul_vmadd_w vr1, vr6, vr20, vr21, vr8, vr3
    vssrarni.h.w vr3, vr8, 12   // t6a low
    vmul_vmsub_w vr1, vr6, vr21, vr20, vr8, vr4
    vssrarni.h.w vr4, vr8, 12   // t7a low

    vsadd.h vr1, vr9, vr2    // t0
    vssub.h vr6, vr9, vr2    // t4
    vsadd.h vr8, vr10, vr5   // t1
    vssub.h vr2, vr10, vr5   // t5
    vsadd.h vr9, vr0, vr3    // t2
    vssub.h vr5, vr0, vr3    // t6
    vsadd.h vr10, vr7, vr4   // t3
    vssub.h vr0, vr7, vr4    // t7

    vldrepl.w vr20, t0, 40   // 1567
    vldrepl.w vr21, t0, 44   // 3784
    vmul_vmadd_w vr6, vr2, vr21, vr20, vr3, vr4
    vssrarni.h.w vr4, vr3, 12   // t4a low
    vmul_vmsub_w vr6, vr2, vr20, vr21, vr3, vr7
    vssrarni.h.w vr7, vr3, 12   // t5a low

    vmul_vmadd_w vr0, vr5, vr20, vr21, vr3, vr2
    vssrarni.h.w vr2, vr3, 12   // t7a low
    vmul_vmsub_w vr0, vr5, vr21, vr20, vr3, vr6
    vssrarni.h.w vr6, vr3, 12   // t6a low

    vsadd.h \out0, vr1, vr9   // out[0]
    vssub.h vr5, vr1, vr9     // t2
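    // Odd-indexed ADST outputs change sign: each vexth.w.h/vsllwil.w.h +
    // vneg.w + vssrarni.h.w group below negates one int16 vector via its
    // two widened 32-bit halves before narrowing back.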
    vsadd.h vr3, vr8, vr10   // out[7]
    vssub.h vr1, vr8, vr10   // t3
    vexth.w.h vr9, vr3
    vsllwil.w.h vr21, vr3, 0
    vneg.w \out7, vr9
    vneg.w vr21, vr21
    vssrarni.h.w \out7, vr21, 0   // out[7]

    vsadd.h vr8, vr4, vr6    // out[1]
    vssub.h vr10, vr4, vr6   // t6
    vexth.w.h vr20, vr8
    vsllwil.w.h vr21, vr8, 0
    vneg.w \out1, vr20
    vneg.w vr21, vr21
    vssrarni.h.w \out1, vr21, 0   // out[1]
    vsadd.h \out6, vr7, vr2   // out[6]
    vssub.h vr4, vr7, vr2     // t7

    vldrepl.w vr20, t0, 32   // 2896
    vmul_vmadd_w vr5, vr1, vr20, vr20, vr9, vr6
    vssrarni.h.w vr6, vr9, 12     // out[3]
    vmul_vmsub_w vr5, vr1, vr20, vr20, vr9, \out4
    vssrarni.h.w \out4, vr9, 12   // out[4]

    vmul_vmadd_w vr10, vr4, vr20, vr20, vr9, \out2
    vssrarni.h.w \out2, vr9, 12   // out[2]
    vmul_vmsub_w vr10, vr4, vr20, vr20, vr9, vr5
    vssrarni.h.w vr5, vr9, 12     // out[5]

    vexth.w.h vr20, vr6
    vsllwil.w.h vr21, vr6, 0
    vneg.w \out3, vr20
    vneg.w vr21, vr21
    vssrarni.h.w \out3, vr21, 0   // out[3]

    vexth.w.h vr20, vr5
    vsllwil.w.h vr21, vr5, 0
    vneg.w \out5, vr20
    vneg.w vr21, vr21
    vssrarni.h.w \out5, vr21, 0   // out[5]
.endm

function inv_txfm_add_adst_dct_8x16_8bpc_lsx
    addi.d sp, sp, -64
    fst.d f24, sp, 0
    fst.d f25, sp, 8
    fst.d f26, sp, 16
    fst.d f27, sp, 24
    fst.d f28, sp, 32
    fst.d f29, sp, 40
    fst.d f30, sp, 48
    fst.d f31, sp, 56

    vld_x8 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

    adst_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                      vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, rect2_lsx

    vld_x8 a2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

    adst_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                      vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, rect2_lsx

.irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
        vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
    vsrari.h \i, \i, 1
.endr

    vreplgr2vr.h vr23, zero

.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240
    vst vr23, a2, \i
.endr

    LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                       vr8, vr9, vr10, vr20, vr21, vr22, vr23, vr31

    LSX_TRANSPOSE8x8_H vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, \
                       vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, \
                       vr8, vr9, vr10, vr20, vr21, vr22, vr23, vr31
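    // From here on this function repeats the idct16 column pass of
    // dct_8x16_core_lsx inline.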
    dct_8x8_core_lsx vr0, vr2, vr4, vr6, vr19, vr25, vr27, vr29, \
                     vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, no_rect2

    la.local t0, idct_coeffs
    vldrepl.w vr20, t0, 32   // 401
    vldrepl.w vr21, t0, 36   // 4076
    vmul_vmadd_w vr1, vr30, vr21, vr20, vr0, vr10
    vssrarni.h.w vr10, vr0, 12   // t15a
    vmul_vmsub_w vr1, vr30, vr20, vr21, vr0, vr29
    vssrarni.h.w vr29, vr0, 12   // t8a

    vldrepl.w vr20, t0, 40   // 3166 -> 1583
    vldrepl.w vr21, t0, 44   // 2598 -> 1299
    vmul_vmadd_w vr24, vr7, vr21, vr20, vr0, vr30
    vssrarni.h.w vr30, vr0, 12   // t14a
    vmul_vmsub_w vr24, vr7, vr20, vr21, vr0, vr31
    vssrarni.h.w vr31, vr0, 12   // t9a

    vldrepl.w vr20, t0, 48   // 1931
    vldrepl.w vr21, t0, 52   // 3612
    vmul_vmadd_w vr5, vr26, vr21, vr20, vr0, vr24
    vssrarni.h.w vr24, vr0, 12   // t13a
    vmul_vmsub_w vr5, vr26, vr20, vr21, vr0, vr25
    vssrarni.h.w vr25, vr0, 12   // t10a

    vldrepl.w vr20, t0, 56   // 3920
    vldrepl.w vr21, t0, 60   // 1189
    vmul_vmadd_w vr28, vr3, vr21, vr20, vr0, vr26
    vssrarni.h.w vr26, vr0, 12   // t12a
    vmul_vmsub_w vr28, vr3, vr20, vr21, vr0, vr27
    vssrarni.h.w vr27, vr0, 12   // t11a

    // vr22 vr23 vr30 vr31 vr24 vr25 vr26 vr27
    vsadd.h vr28, vr29, vr31   // t8
    vssub.h vr19, vr29, vr31   // t9
    vssub.h vr29, vr27, vr25   // t10
    vsadd.h vr9, vr27, vr25    // t11
    vsadd.h vr31, vr26, vr24   // t12
    vssub.h vr25, vr26, vr24   // t13
    vssub.h vr27, vr10, vr30   // t14
    vsadd.h vr24, vr10, vr30   // t15

    vldrepl.w vr20, t0, 8    // 1567
    vldrepl.w vr21, t0, 12   // 3784
    vmul_vmadd_w vr27, vr19, vr21, vr20, vr0, vr26
    vssrarni.h.w vr26, vr0, 12   // t14a
    vmul_vmsub_w vr27, vr19, vr20, vr21, vr0, vr30
    vssrarni.h.w vr30, vr0, 12   // t9a

    vmul_vmadd_w vr25, vr29, vr21, vr20, vr0, vr19
    vneg.w vr0, vr0
    vneg.w vr19, vr19
    vssrarni.h.w vr19, vr0, 12   // t10a
    vmul_vmsub_w vr25, vr29, vr20, vr21, vr0, vr27
    vssrarni.h.w vr27, vr0, 12   // t13a

    vsadd.h vr25, vr28, vr9    // t8a
    vssub.h vr29, vr28, vr9    // t11a
    vssub.h vr28, vr24, vr31   // t12a
    vsadd.h vr10, vr24, vr31   // t15a
    vsadd.h vr9, vr30, vr19    // t9
    vssub.h vr31, vr30, vr19   // t10
    vssub.h vr30, vr26, vr27   // t13
    vsadd.h vr24, vr26, vr27   // t14

    vldrepl.w vr20, t0, 0   // 2896
    vmul_vmadd_w vr30, vr31, vr20, vr20, vr0, vr26
    vssrarni.h.w vr26, vr0, 12   // t13a
    vmul_vmsub_w vr30, vr31, vr20, vr20, vr0, vr27
    vssrarni.h.w vr27, vr0, 12   // t10a

    vmul_vmadd_w vr28, vr29, vr20, vr20, vr0, vr31
    vssrarni.h.w vr31, vr0, 12   // t12
    vmul_vmsub_w vr28, vr29, vr20, vr20, vr0, vr30
    vssrarni.h.w vr30, vr0, 12   // t11
    // vr11 vr12 ... vr18
    vsadd.h vr28, vr14, vr31   // c[3]
    vssub.h vr29, vr14, vr31   // c[12]
    vsadd.h vr20, vr15, vr30   // c[4]
    vssub.h vr21, vr15, vr30   // c[11]
    vsadd.h vr14, vr16, vr27   // c[5]
    vssub.h vr23, vr16, vr27   // c[10]
    vsadd.h vr15, vr17, vr9    // c[6]
    vssub.h vr30, vr17, vr9    // c[9]
    vsadd.h vr16, vr18, vr25   // c[7]
    vssub.h vr27, vr18, vr25   // c[8]
    vsadd.h vr17, vr13, vr26   // c[2]
    vssub.h vr26, vr13, vr26   // c[13]
    vsadd.h vr18, vr12, vr24   // c[1]
    vssub.h vr25, vr12, vr24   // c[14]
    vsadd.h vr22, vr11, vr10   // c[0]
    vssub.h vr24, vr11, vr10   // c[15]

.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
        vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
    vsrari.h \i, \i, 4
.endr

    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr22, vr18, vr17, vr28

    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr20, vr14, vr15, vr16

    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr27, vr30, vr23, vr21

    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr29, vr26, vr25, vr24

    fld.d f24, sp, 0
    fld.d f25, sp, 8
    fld.d f26, sp, 16
    fld.d f27, sp, 24
    fld.d f28, sp, 32
    fld.d f29, sp, 40
    fld.d f30, sp, 48
    fld.d f31, sp, 56
    addi.d sp, sp, 64
endfunc

const iadst16_coeffs, align=4
    .word 4091, 201, 3973, 995
    .word 3703, 1751, 3290, 2440
    .word 2751, 3035, 2106, 3513
    .word 1380, 3857, 601, 4052
endconst
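// adst16_core_lsx runs one 16-point inverse ADST pass on vr0..vr15. The
// optional arguments select post-processing: \transpose8x8 re-transposes the
// two 8x8 halves, \shift applies a rounding right-shift, and \vst stores the
// 16 result vectors to the buffer at t1; an empty argument skips that step.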
.macro adst16_core_lsx transpose8x8, shift, vst
    la.local t0, iadst16_coeffs
    vldrepl.w vr20, t0, 0   // 4091
    vldrepl.w vr21, t0, 4   // 201

    vmul_vmadd_w vr15, vr0, vr20, vr21, vr16, vr18
    vmul_vmsub_w vr15, vr0, vr21, vr20, vr17, vr19
    vssrarni.h.w vr18, vr16, 12   // t0
    vssrarni.h.w vr19, vr17, 12   // t1

    vldrepl.w vr20, t0, 8    // 3973
    vldrepl.w vr21, t0, 12   // 995
    vmul_vmadd_w vr13, vr2, vr20, vr21, vr16, vr0
    vmul_vmsub_w vr13, vr2, vr21, vr20, vr17, vr15
    vssrarni.h.w vr0, vr16, 12    // t2
    vssrarni.h.w vr15, vr17, 12   // t3

    vldrepl.w vr20, t0, 16   // 3703
    vldrepl.w vr21, t0, 20   // 1751
    vmul_vmadd_w vr11, vr4, vr20, vr21, vr16, vr2
    vmul_vmsub_w vr11, vr4, vr21, vr20, vr17, vr13
    vssrarni.h.w vr2, vr16, 12    // t4
    vssrarni.h.w vr13, vr17, 12   // t5

    vldrepl.w vr20, t0, 24   // 3290 -> 1645
    vldrepl.w vr21, t0, 28   // 2440 -> 1220
    vmul_vmadd_w vr9, vr6, vr20, vr21, vr16, vr4
    vmul_vmsub_w vr9, vr6, vr21, vr20, vr17, vr11
    vssrarni.h.w vr4, vr16, 12    // t6
    vssrarni.h.w vr11, vr17, 12   // t7

    vldrepl.w vr20, t0, 32   // 2751
    vldrepl.w vr21, t0, 36   // 3035
    vmul_vmadd_w vr7, vr8, vr20, vr21, vr16, vr6
    vmul_vmsub_w vr7, vr8, vr21, vr20, vr17, vr9
    vssrarni.h.w vr6, vr16, 12   // t8
    vssrarni.h.w vr9, vr17, 12   // t9

    vldrepl.w vr20, t0, 40   // 2106
    vldrepl.w vr21, t0, 44   // 3513
    vmul_vmadd_w vr5, vr10, vr20, vr21, vr16, vr7
    vmul_vmsub_w vr5, vr10, vr21, vr20, vr17, vr8
    vssrarni.h.w vr7, vr16, 12   // t10
    vssrarni.h.w vr8, vr17, 12   // t11

    vldrepl.w vr20, t0, 48   // 1380
    vldrepl.w vr21, t0, 52   // 3857
    vmul_vmadd_w vr3, vr12, vr20, vr21, vr16, vr5
    vmul_vmsub_w vr3, vr12, vr21, vr20, vr17, vr10
    vssrarni.h.w vr5, vr16, 12    // t12
    vssrarni.h.w vr10, vr17, 12   // t13

    vldrepl.w vr20, t0, 56   // 601
    vldrepl.w vr21, t0, 60   // 4052
    vmul_vmadd_w vr1, vr14, vr20, vr21, vr16, vr3
    vmul_vmsub_w vr1, vr14, vr21, vr20, vr17, vr12
    vssrarni.h.w vr3, vr16, 12    // t14
    vssrarni.h.w vr12, vr17, 12   // t15

    vsadd.h vr1, vr18, vr6    // t0a
    vssub.h vr14, vr18, vr6   // t8a
    vsadd.h vr16, vr19, vr9   // t1a
    vssub.h vr17, vr19, vr9   // t9a
    vsadd.h vr6, vr0, vr7     // t2a
    vssub.h vr18, vr0, vr7    // t10a
    vsadd.h vr9, vr15, vr8    // t3a
    vssub.h vr19, vr15, vr8   // t11a
    vsadd.h vr0, vr2, vr5     // t4a
    vssub.h vr7, vr2, vr5     // t12a
    vsadd.h vr8, vr13, vr10   // t5a
    vssub.h vr15, vr13, vr10  // t13a
    vsadd.h vr2, vr4, vr3     // t6a
    vssub.h vr5, vr4, vr3     // t14a
    vsadd.h vr10, vr11, vr12  // t7a
    vssub.h vr13, vr11, vr12  // t15a

    la.local t0, idct_coeffs

    vldrepl.w vr20, t0, 16   // 799
    vldrepl.w vr21, t0, 20   // 4017
    vmul_vmadd_w vr14, vr17, vr21, vr20, vr3, vr11
    vmul_vmsub_w vr14, vr17, vr20, vr21, vr4, vr12
    vssrarni.h.w vr11, vr3, 12   // t8
    vssrarni.h.w vr12, vr4, 12   // t9

    vmul_vmadd_w vr15, vr7, vr20, vr21, vr3, vr14
    vmul_vmsub_w vr15, vr7, vr21, vr20, vr4, vr17
    vssrarni.h.w vr14, vr3, 12   // t13
    vssrarni.h.w vr17, vr4, 12   // t12

    vldrepl.w vr20, t0, 24   // 3406
    vldrepl.w vr21, t0, 28   // 2276
    vmul_vmadd_w vr18, vr19, vr21, vr20, vr3, vr7
    vmul_vmsub_w vr18, vr19, vr20, vr21, vr4, vr15
    vssrarni.h.w vr7, vr3, 12    // t10
    vssrarni.h.w vr15, vr4, 12   // t11

    vmul_vmadd_w vr13, vr5, vr20, vr21, vr3, vr18
    vmul_vmsub_w vr13, vr5, vr21, vr20, vr4, vr19
    vssrarni.h.w vr18, vr3, 12   // t15
    vssrarni.h.w vr19, vr4, 12   // t14

    vsadd.h vr5, vr1, vr0      // t0
    vssub.h vr13, vr1, vr0     // t4
    vsadd.h vr3, vr16, vr8     // t1
    vssub.h vr4, vr16, vr8     // t5
    vsadd.h vr0, vr6, vr2      // t2
    vssub.h vr1, vr6, vr2      // t6
    vsadd.h vr8, vr9, vr10     // t3
    vssub.h vr16, vr9, vr10    // t7
    vsadd.h vr2, vr11, vr17    // t8a
    vssub.h vr6, vr11, vr17    // t12a
    vsadd.h vr9, vr12, vr14    // t9a
    vssub.h vr10, vr12, vr14   // t13a
    vsadd.h vr11, vr7, vr19    // t10a
    vssub.h vr17, vr7, vr19    // t14a
    vsadd.h vr12, vr15, vr18   // t11a
    vssub.h vr14, vr15, vr18   // t15a

    la.local t0, idct_coeffs

    vldrepl.w vr20, t0, 8    // 1567
    vldrepl.w vr21, t0, 12   // 3784
    vmul_vmadd_w vr13, vr4, vr21, vr20, vr7, vr18
    vmul_vmsub_w vr13, vr4, vr20, vr21, vr15, vr19
    vssrarni.h.w vr18, vr7, 12    // t4a
    vssrarni.h.w vr19, vr15, 12   // t5a

    vmul_vmadd_w vr16, vr1, vr20, vr21, vr7, vr4
    vmul_vmsub_w vr16, vr1, vr21, vr20, vr15, vr13
    vssrarni.h.w vr4, vr7, 12     // t7a
    vssrarni.h.w vr13, vr15, 12   // t6a

    vmul_vmadd_w vr6, vr10, vr21, vr20, vr7, vr1
    vmul_vmsub_w vr6, vr10, vr20, vr21, vr15, vr16
    vssrarni.h.w vr1, vr7, 12     // t12
    vssrarni.h.w vr16, vr15, 12   // t13

    vmul_vmadd_w vr14, vr17, vr20, vr21, vr7, vr6
    vmul_vmsub_w vr14, vr17, vr21, vr20, vr15, vr10
    vssrarni.h.w vr6, vr7, 12     // t15
    vssrarni.h.w vr10, vr15, 12   // t14

    vsadd.h vr14, vr5, vr0    // out[0]
    vssub.h vr17, vr5, vr0    // t2a
    vssub.h vr7, vr3, vr8     // t3a
    vsadd.h vr15, vr3, vr8    // out[15]
    vsllwil.w.h vr22, vr15, 0
    vexth.w.h vr15, vr15
    vneg.w vr22, vr22
    vneg.w vr15, vr15
    vssrarni.h.w vr15, vr22, 0   // out[15]
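    // The remaining outputs follow the same pattern: out[12]/out[3],
    // out[14]/out[1] and out[2]/out[13] come from further butterflies, while
    // out[4..11] need a final 2896/4096 (1/sqrt(2)) rotation; odd-indexed
    // outputs are negated as above.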
    vsadd.h vr3, vr19, vr4    // out[12]
    vssub.h vr8, vr19, vr4    // t7
    vssub.h vr0, vr18, vr13   // t6
    vsadd.h vr5, vr18, vr13   // out[3]
    vsllwil.w.h vr22, vr5, 0
    vexth.w.h vr5, vr5
    vneg.w vr22, vr22
    vneg.w vr5, vr5
    vssrarni.h.w vr5, vr22, 0   // out[3]

    vsadd.h vr13, vr9, vr12   // out[14]
    vssub.h vr19, vr9, vr12   // t11
    vssub.h vr4, vr2, vr11    // t10
    vsadd.h vr18, vr2, vr11   // out[1]
    vsllwil.w.h vr22, vr18, 0
    vexth.w.h vr18, vr18
    vneg.w vr22, vr22
    vneg.w vr18, vr18
    vssrarni.h.w vr18, vr22, 0   // out[1]

    vsadd.h vr2, vr1, vr10    // out[2]
    vssub.h vr11, vr1, vr10   // t14a
    vssub.h vr12, vr16, vr6   // t15a
    vsadd.h vr9, vr16, vr6    // out[13]
    vsllwil.w.h vr22, vr9, 0
    vexth.w.h vr9, vr9
    vneg.w vr22, vr22
    vneg.w vr9, vr9
    vssrarni.h.w vr9, vr22, 0   // out[13]

    vldrepl.w vr20, t0, 0   // 2896
    vmul_vmadd_w vr17, vr7, vr20, vr20, vr6, vr10
    vmul_vmsub_w vr17, vr7, vr20, vr20, vr16, vr1
    vssrarni.h.w vr10, vr6, 12   // out[7]

    vsllwil.w.h vr7, vr10, 0
    vexth.w.h vr10, vr10
    vneg.w vr7, vr7
    vneg.w vr10, vr10
    vssrarni.h.w vr10, vr7, 0
    vssrarni.h.w vr1, vr16, 12   // out[8]

    vmul_vmsub_w vr0, vr8, vr20, vr20, vr16, vr17
    vmul_vmadd_w vr0, vr8, vr20, vr20, vr6, vr7
    vssrarni.h.w vr17, vr16, 12   // out[11]

    vsllwil.w.h vr0, vr17, 0
    vexth.w.h vr17, vr17
    vneg.w vr0, vr0
    vneg.w vr17, vr17
    vssrarni.h.w vr17, vr0, 0
    vssrarni.h.w vr7, vr6, 12   // out[4]

    vmul_vmsub_w vr4, vr19, vr20, vr20, vr16, vr0
    vmul_vmadd_w vr4, vr19, vr20, vr20, vr6, vr8
    vssrarni.h.w vr0, vr16, 12   // out[9]

    vsllwil.w.h vr4, vr0, 0
    vexth.w.h vr0, vr0
    vneg.w vr4, vr4
    vneg.w vr0, vr0
    vssrarni.h.w vr0, vr4, 0
    vssrarni.h.w vr8, vr6, 12   // out[6]

    vmul_vmadd_w vr11, vr12, vr20, vr20, vr6, vr4
    vmul_vmsub_w vr11, vr12, vr20, vr20, vr16, vr19
    vssrarni.h.w vr4, vr6, 12   // out[5]

    vsllwil.w.h vr24, vr4, 0
    vexth.w.h vr4, vr4
    vneg.w vr24, vr24
    vneg.w vr4, vr4
    vssrarni.h.w vr4, vr24, 0
    vssrarni.h.w vr19, vr16, 12   // out[10]

.ifnb \transpose8x8
    LSX_TRANSPOSE8x8_H vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10, \
                       vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10, \
                       vr6, vr11, vr12, vr16, vr20, vr21, vr22, vr23

    LSX_TRANSPOSE8x8_H vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15, \
                       vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15, \
                       vr6, vr11, vr12, vr16, vr20, vr21, vr22, vr23
.endif

.ifnb \shift
.irp i, vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10, \
        vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15
    vsrari.h \i, \i, \shift
.endr
.endif

.ifnb \vst
    vst_x16 t1, 0, 16, vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10, \
                       vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15
.endif
// out0 out1 out2  out3  out4  out5  out6  out7
// vr14 vr18 vr2   vr5   vr7   vr4   vr8   vr10
// out8 out9 out10 out11 out12 out13 out14 out15
// vr1  vr0  vr19  vr17  vr3   vr9   vr13  vr15
.endm // adst16_core_lsx
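// adst16_core_finish_lsx adds eight rows of residuals (in0..in7, still
// scaled by 16) to eight 8-pixel destination rows: the pixels are widened to
// u16, the residuals rounded down by 4 bits, summed, packed back to u8 with
// saturation (vssrani.bu.h) and stored through the t4/t5 pointers.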
.macro adst16_core_finish_lsx in0, in1, in2, in3, in4, in5, in6, in7
    fld.d f20, t2, 0
    fldx.d f21, t2, a1
    fld.d f22, t3, 0
    fldx.d f23, t3, a1

    alsl.d t2, a1, t2, 2
    alsl.d t3, a1, t3, 2

    fld.d f24, t2, 0
    fldx.d f25, t2, a1
    fld.d f26, t3, 0
    fldx.d f27, t3, a1

.irp i, vr20, vr21, vr22, vr23, vr24, vr25, vr26, vr27
    vsllwil.hu.bu \i, \i, 0
.endr

.irp i, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7
    vsrari.h \i, \i, 4
.endr

    vadd.h vr20, vr20, \in0
    vadd.h vr21, vr21, \in1
    vadd.h vr22, vr22, \in2
    vadd.h vr23, vr23, \in3
    vadd.h vr24, vr24, \in4
    vadd.h vr25, vr25, \in5
    vadd.h vr26, vr26, \in6
    vadd.h vr27, vr27, \in7

    vssrani.bu.h vr21, vr20, 0
    vssrani.bu.h vr23, vr22, 0
    vssrani.bu.h vr25, vr24, 0
    vssrani.bu.h vr27, vr26, 0

    vstelm.d vr21, t4, 0, 0
    vstelm.d vr21, t5, 0, 1

    alsl.d t4, a1, t4, 1
    alsl.d t5, a1, t5, 1
    vstelm.d vr23, t4, 0, 0
    vstelm.d vr23, t5, 0, 1

    alsl.d t4, a1, t4, 1
    alsl.d t5, a1, t5, 1
    vstelm.d vr25, t4, 0, 0
    vstelm.d vr25, t5, 0, 1

    alsl.d t4, a1, t4, 1
    alsl.d t5, a1, t5, 1
    vstelm.d vr27, t4, 0, 0
    vstelm.d vr27, t5, 0, 1

.endm // adst16_core_finish_lsx

function inv_txfm_add_dct_adst_8x16_8bpc_lsx
    addi.d sp, sp, -64
    fst.d f24, sp, 0
    fst.d f25, sp, 8
    fst.d f26, sp, 16
    fst.d f27, sp, 24
    fst.d f28, sp, 32
    fst.d f29, sp, 40
    fst.d f30, sp, 48
    fst.d f31, sp, 56

    vld_x8 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

    la.local t0, idct_coeffs

    dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                     vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, rect2_lsx

    vld_x8 a2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

    dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                     vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, rect2_lsx

.irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
        vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
    vsrari.h \i, \i, 1
.endr

    vreplgr2vr.h vr23, zero

.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240
    vst vr23, a2, \i
.endr

    LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                       vr8, vr9, vr10, vr20, vr21, vr22, vr23, vr31

    LSX_TRANSPOSE8x8_H vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, \
                       vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
                       vr16, vr17, vr18, vr20, vr21, vr22, vr23, vr31

    adst16_core_lsx , ,

    addi.d t2, a0, 0
    alsl.d t3, a1, a0, 1
    addi.d t4, a0, 0
    add.d t5, a1, a0

    adst16_core_finish_lsx vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10

    alsl.d t2, a1, t2, 2
    alsl.d t3, a1, t3, 2

    alsl.d t4, a1, t4, 1
    alsl.d t5, a1, t5, 1

    adst16_core_finish_lsx vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15

    fld.d f24, sp, 0
    fld.d f25, sp, 8
    fld.d f26, sp, 16
    fld.d f27, sp, 24
    fld.d f28, sp, 32
    fld.d f29, sp, 40
    fld.d f30, sp, 48
    fld.d f31, sp, 56
    addi.d sp, sp, 64
endfunc

.macro malloc_space number
    li.w t0, \number
    sub.d sp, sp, t0
    addi.d sp, sp, -64
    fst.d f24, sp, 0
    fst.d f25, sp, 8
    fst.d f26, sp, 16
    fst.d f27, sp, 24
    fst.d f28, sp, 32
    fst.d f29, sp, 40
    fst.d f30, sp, 48
    fst.d f31, sp, 56
.endm

.macro free_space number
    fld.d f24, sp, 0
    fld.d f25, sp, 8
    fld.d f26, sp, 16
    fld.d f27, sp, 24
    fld.d f28, sp, 32
    fld.d f29, sp, 40
    fld.d f30, sp, 48
    fld.d f31, sp, 56
    li.w t0, \number
    add.d sp, sp, t0
    addi.d sp, sp, 64
.endm
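// malloc_space/free_space reserve \number bytes of transform scratch below a
// fixed 64-byte save area. Per the LoongArch psABI only the low 64 bits of
// vr24..vr31 (aliased to f24..f31) are callee-saved, so fst.d/fld.d pairs
// are sufficient here.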
.macro DST_ADD_W16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11
    vsllwil.hu.bu vr10, \in0, 0
    vexth.hu.bu vr0, \in0
    vsllwil.hu.bu vr11, \in1, 0
    vexth.hu.bu vr1, \in1
    vsllwil.hu.bu vr12, \in2, 0
    vexth.hu.bu vr2, \in2
    vsllwil.hu.bu vr13, \in3, 0
    vexth.hu.bu vr3, \in3
    vadd.h vr10, vr10, \in4
    vadd.h vr0, vr0, \in5
    vadd.h vr11, vr11, \in6
    vadd.h vr1, vr1, \in7
    vadd.h vr12, vr12, \in8
    vadd.h vr2, vr2, \in9
    vadd.h vr13, vr13, \in10
    vadd.h vr3, vr3, \in11
    vssrani.bu.h vr0, vr10, 0
    vssrani.bu.h vr1, vr11, 0
    vssrani.bu.h vr2, vr12, 0
    vssrani.bu.h vr3, vr13, 0
    vst vr0, a0, 0
    vstx vr1, a0, a1
    vst vr2, t2, 0
    vstx vr3, t2, a1
.endm

.macro VLD_DST_ADD_W16 in0, in1, in2, in3, in4, in5, in6, in7, shift

.ifnb \shift
.irp i, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7
    vsrari.h \i, \i, \shift
.endr
.endif

    vld vr0, a0, 0
    vldx vr1, a0, a1
    vld vr2, t2, 0
    vldx vr3, t2, a1
    DST_ADD_W16 vr0, vr1, vr2, vr3, \in0, \in1, \in2, \in3, \
                \in4, \in5, \in6, \in7
.endm

function inv_txfm_add_dct_dct_16x8_8bpc_lsx
    bnez a3, .NO_HAS_DCONLY_16x8

    ld.h t2, a2, 0    // dc
    vldi vr0, 0x8b5   // 181
    vreplgr2vr.w vr1, t2
    vldi vr5, 0x880   // 128
    vmul.w vr2, vr0, vr1   // dc * 181
    st.h zero, a2, 0
    vsrari.w vr2, vr2, 8   // (dc * 181 + 128) >> 8
    alsl.d t2, a1, a0, 1
    vmul.w vr2, vr2, vr0
    vldx vr1, a0, a1
    vsrari.w vr2, vr2, 8
    vldx vr3, t2, a1
    vsrari.w vr2, vr2, 1   // (dc + rnd) >> shift
    vmadd.w vr5, vr2, vr0
    vld vr0, a0, 0
    vssrarni.h.w vr5, vr5, 12
    vld vr2, t2, 0

    DST_ADD_W16 vr0, vr1, vr2, vr3, vr5, vr5, vr5, vr5, vr5, vr5, vr5, vr5

    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W16 vr5, vr5, vr5, vr5, vr5, vr5, vr5, vr5,

    b .DCT_DCT_16x8_END

.NO_HAS_DCONLY_16x8:
    malloc_space 512

    vld_x16 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                       vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30

    la.local t0, idct_coeffs

    vldrepl.w vr23, t0, 0   // 2896
.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
        vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
    rect2_lsx \i, vr23, \i
.endr

    dct_8x16_core_lsx

    LSX_TRANSPOSE8x8_H vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
                       vr0, vr18, vr17, vr28, vr11, vr14, vr15, vr16, \
                       vr13, vr1, vr2, vr3, vr4, vr5, vr6, vr7

    LSX_TRANSPOSE8x8_H vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \
                       vr27, vr30, vr1, vr12, vr29, vr26, vr25, vr24, \
                       vr13, vr31, vr2, vr3, vr4, vr5, vr6, vr7

.irp i, vr0, vr18, vr17, vr28, vr11, vr14, vr15, vr16, \
        vr27, vr30, vr1, vr12, vr29, vr26, vr25, vr24
    vsrari.h \i, \i, 1
.endr

    vst_x16 sp, 64, 16, vr13, vr18, vr17, vr28, vr11, vr14, vr15, vr16, \
                        vr27, vr30, vr23, vr12, vr29, vr26, vr25, vr24

    vreplgr2vr.h vr23, zero
.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240
    vst vr23, a2, \i
.endr

    dct_8x8_core_lsx vr0, vr18, vr17, vr28, vr11, vr14, vr15, vr16, \
                     vr4, vr5, vr6, vr16, vr7, vr18, vr19, vr31, no_rect2

    dct_8x8_core_lsx vr27, vr30, vr1, vr12, vr29, vr26, vr25, vr24, \
                     vr14, vr15, vr17, vr20, vr21, vr22, vr23, vr28, no_rect2

    alsl.d t2, a1, a0, 1
    VLD_DST_ADD_W16 vr4, vr14, vr5, vr15, vr6, vr17, vr16, vr20, 4

    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, a0, 1
    VLD_DST_ADD_W16 vr7, vr21, vr18, vr22, vr19, vr23, vr31, vr28, 4

    free_space 512

.DCT_DCT_16x8_END:

endfunc

function inv_txfm_add_adst_dct_16x8_8bpc_lsx
    addi.d sp, sp, -64
    fst.d f24, sp, 0
    fst.d f25, sp, 8
    fst.d f26, sp, 16
    fst.d f27, sp, 24
    fst.d f28, sp, 32
    fst.d f29, sp, 40
    fst.d f30, sp, 48
    fst.d f31, sp, 56

    addi.d t1, sp, 64
    addi.d t2, a2, 0

    vld_x16 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                       vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15

    la.local t0, idct_coeffs

    vldrepl.w vr23, t0, 0   // 2896
.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
        vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
    rect2_lsx \i, vr23, \i
.endr

    adst16_core_lsx , 1,

    // out0 out1 out2  out3  out4  out5  out6  out7
    // vr14 vr18 vr2   vr5   vr7   vr4   vr8   vr10
    // out8 out9 out10 out11 out12 out13 out14 out15
    // vr1  vr0  vr19  vr17  vr3   vr9   vr13  vr15

    LSX_TRANSPOSE8x8_H vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10, \
                       vr14, vr18, vr2, vr5, vr7, vr4, vr24, vr25, \
                       vr6, vr11, vr12, vr16, vr20, vr21, vr22, vr23

    LSX_TRANSPOSE8x8_H vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15, \
                       vr1, vr0, vr19, vr17, vr3, vr26, vr13, vr15, \
                       vr6, vr11, vr12, vr16, vr20, vr21, vr22, vr23

    vreplgr2vr.h vr23, zero
.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240
    vst vr23, a2, \i
.endr

    dct_8x8_core_lsx vr14, vr18, vr2, vr5, vr7, vr4, vr24, vr25, \
                     vr27, vr28, vr29, vr25, vr30, vr31, vr6, vr16, no_rect2

    dct_8x8_core_lsx vr1, vr0, vr19, vr17, vr3, vr26, vr13, vr15, \
                     vr5, vr7, vr18, vr20, vr21, vr22, vr23, vr24, no_rect2

    alsl.d t2, a1, a0, 1
    VLD_DST_ADD_W16 vr27, vr5, vr28, vr7, vr29, vr18, vr25, vr20, 4

    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, a0, 1
    VLD_DST_ADD_W16 vr30, vr21, vr31, vr22, vr6, vr23, vr16, vr24, 4

    fld.d f24, sp, 0
    fld.d f25, sp, 8
    fld.d f26, sp, 16
    fld.d f27, sp, 24
    fld.d f28, sp, 32
    fld.d f29, sp, 40
    fld.d f30, sp, 48
    fld.d f31, sp, 56
    addi.d sp, sp, 64
endfunc

function inv_txfm_add_dct_dct_16x16_8bpc_lsx
    bnez a3, .NO_HAS_DCONLY_16x16

    ld.h t2, a2, 0    // dc
    vldi vr0, 0x8b5   // 181
    vreplgr2vr.w vr1, t2
    vldi vr5, 0x880   // 128
    vmul.w vr2, vr0, vr1   // dc * 181
    st.h zero, a2, 0
    vsrari.w vr2, vr2, 8   // (dc * 181 + 128) >> 8
    alsl.d t2, a1, a0, 1
    vsrari.w vr2, vr2, 2   // (dc + rnd) >> shift
    vldx vr1, a0, a1
    vmadd.w vr5, vr2, vr0
    vldx vr3, t2, a1
    vssrarni.h.w vr5, vr5, 12
    vld vr0, a0, 0
    vld vr2, t2, 0

    DST_ADD_W16 vr0, vr1, vr2, vr3, vr5, vr5, vr5, vr5, vr5, vr5, vr5, vr5

    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W16 vr5, vr5, vr5, vr5, vr5, vr5, vr5, vr5,

    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W16 vr5, vr5, vr5, vr5, vr5, vr5, vr5, vr5,

    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W16 vr5, vr5, vr5, vr5, vr5, vr5, vr5, vr5,

    b .DCT_DCT_16x16_END

.NO_HAS_DCONLY_16x16:

    malloc_space 512

    vld_x16 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                       vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30

    dct_8x16_core_lsx
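    // 16x16: each 8x16 half-block is transformed, transposed and parked in
    // the stack scratch buffer (left half at sp+64, right half at sp+320);
    // the row pass below reloads it eight columns at a time.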
    LSX_TRANSPOSE8x8_H vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
                       vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

    LSX_TRANSPOSE8x8_H vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \
                       vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \
                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
        vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
    vsrari.h \i, \i, 2
.endr

    vst_x16 sp, 64, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
                        vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24

    vld_x16 a2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                        vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30

    dct_8x16_core_lsx

    LSX_TRANSPOSE8x8_H vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
                       vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

    LSX_TRANSPOSE8x8_H vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \
                       vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \
                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
        vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
    vsrari.h \i, \i, 2
.endr

    vst_x16 sp, 320, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
                         vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24

    vreplgr2vr.h vr31, zero

.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \
        240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, \
        464, 480, 496
    vst vr31, a2, \i
.endr

    vld_x8 sp, 64, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    vld_x8 sp, 320, 16, vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30

    dct_8x16_core_lsx

    vst_x8 sp, 64, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16
    vst_x8 sp, 320, 16, vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24

    vld_x8 sp, 192, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    vld_x8 sp, 448, 16, vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30

    dct_8x16_core_lsx

    alsl.d t2, a1, a0, 1
    vld vr4, sp, 64
    vld vr5, sp, 80
    vld vr6, sp, 96
    vld vr7, sp, 112
    VLD_DST_ADD_W16 vr4, vr22, vr5, vr18, vr6, vr17, vr7, vr28, 4

    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, a0, 1
    vld vr4, sp, 128
    vld vr5, sp, 144
    vld vr6, sp, 160
    vld vr7, sp, 176
    VLD_DST_ADD_W16 vr4, vr20, vr5, vr14, vr6, vr15, vr7, vr16, 4

    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, a0, 1
    vld vr4, sp, 320
    vld vr5, sp, 336
    vld vr6, sp, 352
    vld vr7, sp, 368
    VLD_DST_ADD_W16 vr4, vr27, vr5, vr30, vr6, vr23, vr7, vr21, 4

    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, a0, 1
    vld vr4, sp, 384
    vld vr5, sp, 400
    vld vr6, sp, 416
    vld vr7, sp, 432
    VLD_DST_ADD_W16 vr4, vr29, vr5, vr26, vr6, vr25, vr7, vr24, 4

    free_space 512

.DCT_DCT_16x16_END:
endfunc

function inv_txfm_add_adst_adst_16x16_8bpc_lsx

    malloc_space 256+256

    addi.d t1, sp, 64
    addi.d t2, a2, 0

    vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                       vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15

    adst16_core_lsx transpose8x8, 2, vst_x16

    addi.d t2, a2, 16
    addi.d t1, t1, 256

    vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                       vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15

    adst16_core_lsx transpose8x8, 2, vst_x16
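    // Both ADST column passes are buffered on the stack; clear the
    // coefficient array, then run the row passes and accumulate into the
    // destination eight pixels at a time via adst16_core_finish_lsx.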
    vreplgr2vr.h vr23, zero

.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \
        240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, \
        464, 480, 496
    vst vr23, a2, \i
.endr

    addi.d t2, sp, 64

    vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    vld_x8 t2, 256, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15

    adst16_core_lsx , ,

    // out0 out1 out2  out3  out4  out5  out6  out7
    // vr14 vr18 vr2   vr5   vr7   vr4   vr8   vr10
    // out8 out9 out10 out11 out12 out13 out14 out15
    // vr1  vr0  vr19  vr17  vr3   vr9   vr13  vr15

    addi.d t2, a0, 0
    alsl.d t3, a1, a0, 1
    addi.d t4, a0, 0
    add.d t5, a1, a0

    adst16_core_finish_lsx vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10

    alsl.d t2, a1, t2, 2
    alsl.d t3, a1, t3, 2

    alsl.d t4, a1, t4, 1
    alsl.d t5, a1, t5, 1

    adst16_core_finish_lsx vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15

    addi.d t2, sp, 64+128

    vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    vld_x8 t2, 256, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15

    adst16_core_lsx , ,

    addi.d a0, a0, 8

    addi.d t2, a0, 0
    alsl.d t3, a1, a0, 1
    addi.d t4, a0, 0
    add.d t5, a1, a0

    adst16_core_finish_lsx vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10

    alsl.d t2, a1, t2, 2
    alsl.d t3, a1, t3, 2

    alsl.d t4, a1, t4, 1
    alsl.d t5, a1, t5, 1

    adst16_core_finish_lsx vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15

    free_space 256+256
endfunc

function inv_txfm_add_adst_dct_16x16_8bpc_lsx
    malloc_space 256+256

    addi.d t1, sp, 64
    addi.d t2, a2, 0

    vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                       vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15

    adst16_core_lsx transpose8x8, 2, vst_x16

    addi.d t2, a2, 16
    addi.d t1, t1, 256

    vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                       vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15

    adst16_core_lsx transpose8x8, 2, vst_x16

    vreplgr2vr.h vr23, zero
.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \
        240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, \
        464, 480, 496
    vst vr23, a2, \i
.endr

    addi.d t2, sp, 64

    vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    vld_x8 t2, 256, 16, vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30

    dct_8x16_core_lsx

    vst_x8 t2, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16
    vst_x8 t2, 256, 16, vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24

    addi.d t2, sp, 64+128

    vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    vld_x8 t2, 256, 16, vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30

    dct_8x16_core_lsx

    alsl.d t2, a1, a0, 1
    vld vr4, sp, 64
    vld vr5, sp, 80
    vld vr6, sp, 96
    vld vr7, sp, 112
    VLD_DST_ADD_W16 vr4, vr22, vr5, vr18, vr6, vr17, vr7, vr28, 4

    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, a0, 1
    vld vr4, sp, 128
    vld vr5, sp, 144
    vld vr6, sp, 160
    vld vr7, sp, 176
    VLD_DST_ADD_W16 vr4, vr20, vr5, vr14, vr6, vr15, vr7, vr16, 4

    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, a0, 1
    vld vr4, sp, 320
    vld vr5, sp, 336
    vld vr6, sp, 352
    vld vr7, sp, 368
    VLD_DST_ADD_W16 vr4, vr27, vr5, vr30, vr6, vr23, vr7, vr21, 4

    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, a0, 1
    vld vr4, sp, 384
    vld vr5, sp, 400
    vld vr6, sp, 416
    vld vr7, sp, 432
    VLD_DST_ADD_W16 vr4, vr29, vr5, vr26, vr6, vr25, vr7, vr24, 4

    free_space 256+256
endfunc

function inv_txfm_add_dct_adst_16x16_8bpc_lsx
    malloc_space 256+256

    vld_x16 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                       vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30

    dct_8x16_core_lsx

    LSX_TRANSPOSE8x8_H vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
                       vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

    LSX_TRANSPOSE8x8_H vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \
                       vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \
                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
        vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
    vsrari.h \i, \i, 2
.endr

    vst_x16 sp, 64, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
                        vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24

    vld_x16 a2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                        vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30

    dct_8x16_core_lsx

    LSX_TRANSPOSE8x8_H vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
                       vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

    LSX_TRANSPOSE8x8_H vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \
                       vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \
                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
        vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
    vsrari.h \i, \i, 2
.endr

    vst_x16 sp, 320, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
                         vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24

    vreplgr2vr.h vr31, zero

.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \
        240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, \
        464, 480, 496
    vst vr31, a2, \i
.endr

    addi.d t2, sp, 64

    vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    vld_x8 t2, 256, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15

    adst16_core_lsx , ,

    // out0 out1 out2  out3  out4  out5  out6  out7
    // vr14 vr18 vr2   vr5   vr7   vr4   vr8   vr10
    // out8 out9 out10 out11 out12 out13 out14 out15
    // vr1  vr0  vr19  vr17  vr3   vr9   vr13  vr15

    addi.d t2, a0, 0
    alsl.d t3, a1, a0, 1
    addi.d t4, a0, 0
    add.d t5, a1, a0

    adst16_core_finish_lsx vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10

    alsl.d t2, a1, t2, 2
    alsl.d t3, a1, t3, 2

    alsl.d t4, a1, t4, 1
    alsl.d t5, a1, t5, 1

    adst16_core_finish_lsx vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15

    addi.d t2, sp, 64+128

    vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    vld_x8 t2, 256, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15

    adst16_core_lsx , ,

    addi.d a0, a0, 8

    addi.d t2, a0, 0
    alsl.d t3, a1, a0, 1
    addi.d t4, a0, 0
    add.d t5, a1, a0

    adst16_core_finish_lsx vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10

    alsl.d t2, a1, t2, 2
    alsl.d t3, a1, t3, 2

    alsl.d t4, a1, t4, 1
    alsl.d t5, a1, t5, 1

    adst16_core_finish_lsx vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15

    free_space 256+256
endfunc

const shufb
    .byte 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
endconst
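// shufb reverses the order of the eight 16-bit lanes in a vector: byte pair
// (14,15) moves to lanes 0-1 and so on, so vshuf.b with this pattern mirrors
// a row of coefficients. The flipadst variants below use it to implement the
// horizontal flip.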
function inv_txfm_add_flipadst_dct_16x16_8bpc_lsx
    malloc_space 256+256

    addi.d t1, sp, 64
    addi.d t2, a2, 0

    vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                       vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15

    adst16_core_lsx transpose8x8, 2, vst_x16

    addi.d t2, a2, 16
    addi.d t1, t1, 256

    vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                       vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15

    adst16_core_lsx transpose8x8, 2, vst_x16

    vreplgr2vr.h vr23, zero
.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \
        240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, \
        464, 480, 496
    vst vr23, a2, \i
.endr

    addi.d t2, sp, 64

    vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    vld_x8 t2, 256, 16, vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30

    dct_8x16_core_lsx

    la.local t0, shufb
    vld vr0, t0, 0

.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
        vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
    vshuf.b \i, \i, \i, vr0
.endr

    vst_x8 t2, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16
    vst_x8 t2, 256, 16, vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24

    addi.d t2, sp, 64+128

    vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    vld_x8 t2, 256, 16, vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30

    dct_8x16_core_lsx

    la.local t0, shufb
    vld vr0, t0, 0

.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
        vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
    vshuf.b \i, \i, \i, vr0
.endr

    alsl.d t2, a1, a0, 1
    vld vr4, sp, 64
    vld vr5, sp, 80
    vld vr6, sp, 96
    vld vr7, sp, 112
    VLD_DST_ADD_W16 vr22, vr4, vr18, vr5, vr17, vr6, vr28, vr7, 4

    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, a0, 1
    vld vr4, sp, 128
    vld vr5, sp, 144
    vld vr6, sp, 160
    vld vr7, sp, 176
    VLD_DST_ADD_W16 vr20, vr4, vr14, vr5, vr15, vr6, vr16, vr7, 4

    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, a0, 1
    vld vr4, sp, 320
    vld vr5, sp, 336
    vld vr6, sp, 352
    vld vr7, sp, 368
    VLD_DST_ADD_W16 vr27, vr4, vr30, vr5, vr23, vr6, vr21, vr7, 4

    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, a0, 1
    vld vr4, sp, 384
    vld vr5, sp, 400
    vld vr6, sp, 416
    vld vr7, sp, 432
    VLD_DST_ADD_W16 vr29, vr4, vr26, vr5, vr25, vr6, vr24, vr7, 4

    free_space 256+256
endfunc

function inv_txfm_add_dct_flipadst_16x16_8bpc_lsx
    malloc_space 256+256

    vld_x16 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                       vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30

    dct_8x16_core_lsx

    LSX_TRANSPOSE8x8_H vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
                       vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

    LSX_TRANSPOSE8x8_H vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \
                       vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \
                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
        vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
    vsrari.h \i, \i, 2
.endr

    vst_x16 sp, 64, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
                        vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
    vld_x16 a2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                        vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30

    dct_8x16_core_lsx

    LSX_TRANSPOSE8x8_H vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
                       vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

    LSX_TRANSPOSE8x8_H vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \
                       vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \
                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
        vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
    vsrari.h \i, \i, 2
.endr

    vst_x16 sp, 320, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
                         vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24

    vreplgr2vr.h vr31, zero

.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \
        240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, \
        464, 480, 496
    vst vr31, a2, \i
.endr

    addi.d t2, sp, 64

    vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    vld_x8 t2, 256, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15

    adst16_core_lsx , ,

    // out0 out1 out2  out3  out4  out5  out6  out7
    // vr14 vr18 vr2   vr5   vr7   vr4   vr8   vr10
    // out8 out9 out10 out11 out12 out13 out14 out15
    // vr1  vr0  vr19  vr17  vr3   vr9   vr13  vr15

    la.local t0, shufb
    vld vr31, t0, 0

    addi.d t2, a0, 0
    alsl.d t3, a1, a0, 1
    addi.d t4, a0, 0
    add.d t5, a1, a0

    adst16_core_finish_lsx vr15, vr13, vr9, vr3, vr17, vr19, vr0, vr1

    alsl.d t2, a1, t2, 2
    alsl.d t3, a1, t3, 2

    alsl.d t4, a1, t4, 1
    alsl.d t5, a1, t5, 1

    adst16_core_finish_lsx vr10, vr8, vr4, vr7, vr5, vr2, vr18, vr14

    addi.d t2, sp, 64+128

    vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    vld_x8 t2, 256, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15

    adst16_core_lsx , ,

    addi.d a0, a0, 8

    la.local t0, shufb
    vld vr31, t0, 0

    addi.d t2, a0, 0
    alsl.d t3, a1, a0, 1
    addi.d t4, a0, 0
    add.d t5, a1, a0

    adst16_core_finish_lsx vr15, vr13, vr9, vr3, vr17, vr19, vr0, vr1

    alsl.d t2, a1, t2, 2
    alsl.d t3, a1, t3, 2

    alsl.d t4, a1, t4, 1
    alsl.d t5, a1, t5, 1

    adst16_core_finish_lsx vr10, vr8, vr4, vr7, vr5, vr2, vr18, vr14

    free_space 256+256

endfunc

function inv_txfm_add_dct_dct_8x32_8bpc_lsx
    bnez a3, .NO_HAS_DCONLY_8x32

    ld.h t2, a2, 0    // dc
    vldi vr0, 0x8b5   // 181
    vreplgr2vr.w vr1, t2
    vldi vr5, 0x880   // 128
    vmul.w vr2, vr0, vr1   // dc * 181
    st.h zero, a2, 0
    vsrari.w vr2, vr2, 8   // (dc * 181 + 128) >> 8
    vld vr10, a0, 0        // 0 1 2 3 4 5 6 7
    vsrari.w vr2, vr2, 2   // (dc + rnd) >> shift
    vldx vr11, a0, a1      // 8 9 10 11 12 13 14 15
    alsl.d t2, a1, a0, 1
    vmadd.w vr5, vr2, vr0
    vld vr12, t2, 0        // 16 17 18 19 20 21 22 23
    vssrarni.h.w vr5, vr5, 12
    vldx vr13, t2, a1      // 24 25 26 27 28 29 30 31

    DST_ADD_W8 vr10, vr11, vr12, vr13, vr5, vr5, vr5, vr5

.rept 7
    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, a0, 1

    VLD_DST_ADD_W8 vr5, vr5, vr5, vr5
.endr

    b .DCT_DCT_8X32_END

.NO_HAS_DCONLY_8x32:
    malloc_space 512

    vld_x8 a2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

    la.local t0, idct_coeffs

    dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                     vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, no_rect2
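    // 8x32 first pass: the 32 rows are processed in four batches of eight
    // (the 64-byte load stride picks every fourth row); each batch is
    // 8-point transformed, transposed and stored to the stack buffer, and
    // the 32-point column pass then runs on that buffer.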
.irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18
    vsrari.h \i, \i, 2
.endr

    LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
                       vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
                       vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30

    vst_x8 sp, 64, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18

    vld_x8 a2, 16, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

    dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                     vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, no_rect2

.irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18
    vsrari.h \i, \i, 2
.endr

    LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
                       vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
                       vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30

    vst_x8 sp, 192, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18

    vld_x8 a2, 32, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

    la.local t0, idct_coeffs

    dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                     vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, no_rect2

.irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18
    vsrari.h \i, \i, 2
.endr

    LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
                       vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
                       vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30

    vst_x8 sp, 320, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18

    vld_x8 a2, 48, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

    dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                     vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, no_rect2

.irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18
    vsrari.h \i, \i, 2
.endr

    LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
                       vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
                       vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30

    vst_x8 sp, 448, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18

    vreplgr2vr.h vr31, zero

.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \
        240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, \
        464, 480, 496
    vst vr31, a2, \i
.endr

    addi.d t2, sp, 64
    addi.d t3, sp, 64

    vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                       vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30

    dct_8x16_core_lsx

    vst_x16 t3, 0, 32, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
                       vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24

    vld_x16 t2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                        vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30

    // vr0,  vr1,  vr2,  vr3,  vr4,  vr5,  vr6,  vr7
    // in1   in3   in5   in7   in9   in11  in13  in15
    // vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
    // in17  in19  in21  in23  in25  in27  in29  in31

    la.local t0, idct_coeffs
    vldrepl.w vr20, t0, 64   // 201
    vldrepl.w vr21, t0, 68   // 4091

    vmul_vmadd_w vr0, vr30, vr21, vr20, vr8, vr9
    vssrarni.h.w vr9, vr8, 12    // t31a
    vmul_vmsub_w vr0, vr30, vr20, vr21, vr11, vr10
    vssrarni.h.w vr10, vr11, 12  // t16a

    vldrepl.w vr20, t0, 72   // 3035
    vldrepl.w vr21, t0, 76   // 2751
    vmul_vmadd_w vr19, vr7, vr21, vr20, vr11, vr0
    vssrarni.h.w vr0, vr11, 12   // t30a
    vmul_vmsub_w vr19, vr7, vr20, vr21, vr11, vr30
    vssrarni.h.w vr30, vr11, 12  // t17a
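    // idct32 odd half: the sixteen odd input rows are rotated pairwise with
    // the idct32 constants (idct_coeffs + 64..124) into t16a..t31a, then
    // combined through the same ladder of butterflies and 799/4017,
    // 3406/2276, 1567/3784 and 2896 rotations as the idct16 above.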
vr20, t0, 80 // 1751 6194 vldrepl.w vr21, t0, 84 // 3703 6195 vmul_vmadd_w vr4, vr26, vr21, vr20, vr8, vr7 6196 vssrarni.h.w vr7, vr8, 12 // t29a 6197 vmul_vmsub_w vr4, vr26, vr20, vr21, vr8, vr19 6198 vssrarni.h.w vr19, vr8, 12 // t18a 6199 6200 vldrepl.w vr20, t0, 88 // 3857 6201 vldrepl.w vr21, t0, 92 // 1380 6202 vmul_vmadd_w vr27, vr3, vr21, vr20, vr8, vr4 6203 vssrarni.h.w vr4, vr8, 12 // t28a 6204 vmul_vmsub_w vr27, vr3, vr20, vr21, vr8, vr26 6205 vssrarni.h.w vr26, vr8, 12 // t19a 6206 6207 vldrepl.w vr20, t0, 96 // 995 6208 vldrepl.w vr21, t0, 100 // 3973 6209 vmul_vmadd_w vr2, vr28, vr21, vr20, vr8, vr3 6210 vssrarni.h.w vr3, vr8, 12 // t27a 6211 vmul_vmsub_w vr2, vr28, vr20, vr21, vr8, vr27 6212 vssrarni.h.w vr27, vr8, 12 // t20a 6213 6214 vldrepl.w vr20, t0, 104 // 3513 6215 vldrepl.w vr21, t0, 108 // 2106 6216 vmul_vmadd_w vr25, vr5, vr21, vr20, vr8, vr2 6217 vssrarni.h.w vr2, vr8, 12 // t26a 6218 vmul_vmsub_w vr25, vr5, vr20, vr21, vr8, vr28 6219 vssrarni.h.w vr28, vr8, 12 // t21a 6220 6221 vldrepl.w vr20, t0, 112 // 2440 -> 1220 6222 vldrepl.w vr21, t0, 116 // 3290 -> 1645 6223 vmul_vmadd_w vr6, vr24, vr21, vr20, vr8, vr5 6224 vssrarni.h.w vr5, vr8, 12 // t25a 6225 vmul_vmsub_w vr6, vr24, vr20, vr21, vr8, vr25 6226 vssrarni.h.w vr25, vr8, 12 // t22a 6227 6228 vldrepl.w vr20, t0, 120 // 4052 6229 vldrepl.w vr21, t0, 124 // 601 6230 vmul_vmadd_w vr29, vr1, vr21, vr20, vr8, vr6 6231 vssrarni.h.w vr6, vr8, 12 // t24a 6232 vmul_vmsub_w vr29, vr1, vr20, vr21, vr8, vr24 6233 vssrarni.h.w vr24, vr8, 12 // t23a 6234 6235 vsadd.h vr1, vr10, vr30 // t16 6236 vssub.h vr29, vr10, vr30 // t17 6237 vssub.h vr8, vr26, vr19 // t18 6238 vsadd.h vr31, vr26, vr19 // t19 6239 vsadd.h vr10, vr27, vr28 // t20 6240 vssub.h vr30, vr27, vr28 // t21 6241 vssub.h vr19, vr24, vr25 // t22 6242 vsadd.h vr26, vr24, vr25 // t23 6243 vsadd.h vr27, vr6, vr5 // t24 6244 vssub.h vr28, vr6, vr5 // t25 6245 vssub.h vr24, vr3, vr2 // t26 6246 vsadd.h vr25, vr3, vr2 // t27 6247 vsadd.h vr5, vr4, vr7 // t28 6248 vssub.h vr6, vr4, vr7 // t29 6249 vssub.h vr2, vr9, vr0 // t30 6250 vsadd.h vr3, vr9, vr0 // t31 6251 6252 vldrepl.w vr20, t0, 16 // 799 6253 vldrepl.w vr21, t0, 20 // 4017 6254 vmul_vmadd_w vr2, vr29, vr21, vr20, vr4, vr7 6255 vssrarni.h.w vr7, vr4, 12 // t30a 6256 vmul_vmsub_w vr2, vr29, vr20, vr21, vr4, vr0 6257 vssrarni.h.w vr0, vr4, 12 // t17a 6258 vmul_vmadd_w vr6, vr8, vr21, vr20, vr4, vr9 6259 vneg.w vr4, vr4 6260 vneg.w vr9, vr9 6261 vssrarni.h.w vr9, vr4, 12 // t18a 6262 vmul_vmsub_w vr6, vr8, vr20, vr21, vr4, vr2 6263 vssrarni.h.w vr2, vr4, 12 // t29a 6264 6265 vldrepl.w vr20, t0, 24 // 3406 -> 1703 6266 vldrepl.w vr21, t0, 28 // 2276 -> 1138 6267 vmul_vmadd_w vr24, vr30, vr21, vr20, vr4, vr29 6268 vssrarni.h.w vr29, vr4, 12 // t26a 6269 vmul_vmsub_w vr24, vr30, vr20, vr21, vr4, vr6 6270 vssrarni.h.w vr6, vr4, 12 // t21a 6271 6272 vmul_vmadd_w vr28, vr19, vr21, vr20, vr4, vr8 6273 vneg.w vr4, vr4 6274 vneg.w vr8, vr8 6275 vssrarni.h.w vr8, vr4, 12 // t22a 6276 vmul_vmsub_w vr28, vr19, vr20, vr21, vr4, vr24 6277 vssrarni.h.w vr24, vr4, 12 // t25a 6278 6279 vsadd.h vr4, vr1, vr31 // t16a 6280 vssub.h vr30, vr1, vr31 // t19a 6281 vsadd.h vr19, vr0, vr9 // t17 6282 vssub.h vr28, vr0, vr9 // t18 6283 vssub.h vr1, vr26, vr10 // t20a 6284 vsadd.h vr31, vr26, vr10 // t23a 6285 vssub.h vr0, vr8, vr6 // t21 6286 vsadd.h vr9, vr8, vr6 // t22 6287 vsadd.h vr10, vr27, vr25 // t24a 6288 vssub.h vr26, vr27, vr25 // t27a 6289 vsadd.h vr6, vr24, vr29 // t25 6290 vssub.h vr8, vr24, vr29 // t26 6291 vssub.h vr25, 
vr3, vr5 // t28a
    vsadd.h vr27, vr3, vr5 // t31a
    vssub.h vr24, vr7, vr2 // t29
    vsadd.h vr29, vr7, vr2 // t30

    vldrepl.w vr20, t0, 8 // 1567
    vldrepl.w vr21, t0, 12 // 3784
    vmul_vmadd_w vr24, vr28, vr21, vr20, vr3, vr5
    vssrarni.h.w vr5, vr3, 12 // t29a
    vmul_vmsub_w vr24, vr28, vr20, vr21, vr3, vr2
    vssrarni.h.w vr2, vr3, 12 // t18a

    vmul_vmadd_w vr25, vr30, vr21, vr20, vr3, vr7
    vssrarni.h.w vr7, vr3, 12 // t28
    vmul_vmsub_w vr25, vr30, vr20, vr21, vr3, vr24
    vssrarni.h.w vr24, vr3, 12 // t19

    vmul_vmadd_w vr26, vr1, vr21, vr20, vr3, vr28
    vneg.w vr3, vr3
    vneg.w vr28, vr28
    vssrarni.h.w vr28, vr3, 12 // t20
    vmul_vmsub_w vr26, vr1, vr20, vr21, vr3, vr25
    vssrarni.h.w vr25, vr3, 12 // t27

    vmul_vmadd_w vr8, vr0, vr21, vr20, vr3, vr30
    vneg.w vr3, vr3
    vneg.w vr30, vr30
    vssrarni.h.w vr30, vr3, 12 // t21a
    vmul_vmsub_w vr8, vr0, vr20, vr21, vr3, vr1
    vssrarni.h.w vr1, vr3, 12 // t26a

    vsadd.h vr3, vr4, vr31 // t16
    vssub.h vr26, vr4, vr31 // t23
    vsadd.h vr0, vr19, vr9 // t17a
    vssub.h vr8, vr19, vr9 // t22a
    vsadd.h vr4, vr2, vr30 // t18
    vssub.h vr31, vr2, vr30 // t21
    vsadd.h vr9, vr24, vr28 // t19a
    vssub.h vr19, vr24, vr28 // t20a
    vssub.h vr2, vr27, vr10 // t24
    vsadd.h vr30, vr27, vr10 // t31
    vssub.h vr24, vr29, vr6 // t25a
    vsadd.h vr28, vr29, vr6 // t30a
    vssub.h vr10, vr5, vr1 // t26
    vsadd.h vr27, vr5, vr1 // t29
    vssub.h vr6, vr7, vr25 // t27a
    vsadd.h vr29, vr7, vr25 // t28a

    vldrepl.w vr20, t0, 0 // 2896
    vmul_vmsub_w vr6, vr19, vr20, vr20, vr1, vr5
    vssrarni.h.w vr5, vr1, 12 // t20
    vmul_vmadd_w vr6, vr19, vr20, vr20, vr1, vr7
    vssrarni.h.w vr7, vr1, 12 // t27

    vmul_vmsub_w vr10, vr31, vr20, vr20, vr1, vr25
    vssrarni.h.w vr25, vr1, 12 // t21a
    vmul_vmadd_w vr10, vr31, vr20, vr20, vr1, vr6
    vssrarni.h.w vr6, vr1, 12 // t26a

    vmul_vmsub_w vr24, vr8, vr20, vr20, vr1, vr19
    vssrarni.h.w vr19, vr1, 12 // t22
    vmul_vmadd_w vr24, vr8, vr20, vr20, vr1, vr10
    vssrarni.h.w vr10, vr1, 12 // t25

    vmul_vmsub_w vr2, vr26, vr20, vr20, vr1, vr31
    vssrarni.h.w vr31, vr1, 12 // t23a
    vmul_vmadd_w vr2, vr26, vr20, vr20, vr1, vr8
    vssrarni.h.w vr8, vr1, 12 // t24a

    // t31 t30a t29 t28a t27 t26a t25 t24a t23a t22 t21a t20 t19a t18 t17a t16
    // vr30 vr28 vr27 vr29 vr7 vr6 vr10 vr8 vr31 vr19 vr25 vr5 vr9 vr4 vr0 vr3

    vld_x8 t3, 0, 32, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18

    vsadd.h vr1, vr11, vr30 // c[0]
    vssub.h vr2, vr11, vr30 // c[31]
    vsadd.h vr24, vr12, vr28 // c[1]
    vssub.h vr26, vr12, vr28 // c[30]
    vsadd.h vr11, vr13, vr27 // c[2]
    vssub.h vr30, vr13, vr27 // c[29]
    vsadd.h vr12, vr14, vr29 // c[3]
    vssub.h vr28, vr14, vr29 // c[28]
    vsadd.h vr13, vr15, vr7 // c[4]
    vssub.h vr27, vr15, vr7 // c[27]
    vsadd.h vr14, vr16, vr6 // c[5]
    vssub.h vr29, vr16, vr6 // c[26]
    vsadd.h vr7, vr17, vr10 // c[6]
    vssub.h vr15, vr17, vr10 // c[25]
    vsadd.h vr6, vr18, vr8 // c[7]
    vssub.h vr16, vr18, vr8 // c[24]

.irp i, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \
        vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2
    vsrari.h \i, \i, 4
.endr

    vst_x8 t2, 0, 16, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6

    vst_x8 t2, 128, 16, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2

    vld_x8 t3, 256, 32, \
vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 6392 6393 vsadd.h vr1, vr11, vr31 // c[8] 6394 vssub.h vr2, vr11, vr31 // c[23] 6395 vsadd.h vr24, vr12, vr19 // c[9] 6396 vssub.h vr26, vr12, vr19 // c[22] 6397 vsadd.h vr11, vr13, vr25 // c[10] 6398 vssub.h vr30, vr13, vr25 // c[21] 6399 vsadd.h vr12, vr14, vr5 // c[11] 6400 vssub.h vr28, vr14, vr5 // c[20] 6401 vsadd.h vr13, vr15, vr9 // c[12] 6402 vssub.h vr27, vr15, vr9 // c[19] 6403 vsadd.h vr14, vr16, vr4 // c[13] 6404 vssub.h vr29, vr16, vr4 // c[18] 6405 vsadd.h vr7, vr17, vr0 // c[14] 6406 vssub.h vr15, vr17, vr0 // c[17] 6407 vsadd.h vr6, vr18, vr3 // c[15] 6408 vssub.h vr16, vr18, vr3 // c[16] 6409 6410.irp i, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \ 6411 vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 6412 vsrari.h \i, \i, 4 6413.endr 6414 6415 vst_x8 t2, 256, 16, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6 6416 6417 vst_x8 t2, 384, 16, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 6418 6419 alsl.d t2, a1, a0, 1 6420 addi.d t3, sp, 64 6421 6422 vld vr4, t3, 0 6423 vld vr5, t3, 16 6424 vld vr6, t3, 32 6425 vld vr7, t3, 48 6426 VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 6427 6428 addi.d t3, sp, 64+64 6429 alsl.d a0, a1, a0, 2 6430 alsl.d t2, a1, t2, 2 6431 vld vr4, t3, 0 6432 vld vr5, t3, 16 6433 vld vr6, t3, 32 6434 vld vr7, t3, 48 6435 VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 6436 6437 addi.d t3, sp, 64+256 6438 alsl.d a0, a1, a0, 2 6439 alsl.d t2, a1, t2, 2 6440 vld vr4, t3, 0 6441 vld vr5, t3, 16 6442 vld vr6, t3, 32 6443 vld vr7, t3, 48 6444 VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 6445 6446 addi.d t3, t3, 64 6447 alsl.d a0, a1, a0, 2 6448 alsl.d t2, a1, t2, 2 6449 vld vr4, t3, 0 6450 vld vr5, t3, 16 6451 vld vr6, t3, 32 6452 vld vr7, t3, 48 6453 VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 6454 6455 addi.d t3, sp, 64+384 6456 alsl.d a0, a1, a0, 2 6457 alsl.d t2, a1, t2, 2 6458 vld vr4, t3, 0 6459 vld vr5, t3, 16 6460 vld vr6, t3, 32 6461 vld vr7, t3, 48 6462 VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 6463 6464 addi.d t3, t3, 64 6465 alsl.d a0, a1, a0, 2 6466 alsl.d t2, a1, t2, 2 6467 vld vr4, t3, 0 6468 vld vr5, t3, 16 6469 vld vr6, t3, 32 6470 vld vr7, t3, 48 6471 VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 6472 6473 addi.d t3, sp, 64+128 6474 alsl.d a0, a1, a0, 2 6475 alsl.d t2, a1, t2, 2 6476 vld vr4, t3, 0 6477 vld vr5, t3, 16 6478 vld vr6, t3, 32 6479 vld vr7, t3, 48 6480 VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 6481 6482 addi.d t3, t3, 64 6483 alsl.d a0, a1, a0, 2 6484 alsl.d t2, a1, t2, 2 6485 vld vr4, t3, 0 6486 vld vr5, t3, 16 6487 vld vr6, t3, 32 6488 vld vr7, t3, 48 6489 VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 6490 6491 free_space 512 6492.DCT_DCT_8X32_END: 6493endfunc 6494 6495.macro dct_8x32_core_lsx in1, in2, vst_start0, vst_start1, vst_start2, \ 6496 vst_start3, transpose8x8, shift 6497 6498 // vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 6499 // in1 in3 in5 in7 in9 in11 in13 in15 6500 // vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 6501 // in17 in19 in21 in23 in25 in27 in29 in31 6502 6503 la.local t0, idct_coeffs 6504 vldrepl.w vr20, t0, 64 // 201 6505 vldrepl.w vr21, t0, 68 // 4091 6506 6507 vmul_vmadd_w vr0, vr30, vr21, vr20, vr8, vr9 6508 vmul_vmsub_w vr0, vr30, vr20, vr21, vr11, vr10 6509 vssrarni.h.w vr9, vr8, 12 // t31a 6510 vssrarni.h.w vr10, vr11, 12 // t16a 6511 6512 vldrepl.w vr20, t0, 72 // 3035 6513 vldrepl.w vr21, t0, 76 // 2751 6514 vmul_vmadd_w vr19, vr7, vr21, vr20, vr8, vr0 6515 vmul_vmsub_w vr19, vr7, vr20, vr21, vr11, vr30 6516 vssrarni.h.w vr0, vr8, 12 // t30a 6517 vssrarni.h.w vr30, vr11, 12 // t17a 6518 6519 vldrepl.w vr20, t0, 80 // 1751 6520 vldrepl.w 
vr21, t0, 84 // 3703 6521 vmul_vmadd_w vr4, vr26, vr21, vr20, vr8, vr7 6522 vmul_vmsub_w vr4, vr26, vr20, vr21, vr11, vr19 6523 vssrarni.h.w vr7, vr8, 12 // t29a 6524 vssrarni.h.w vr19, vr11, 12 // t18a 6525 6526 vldrepl.w vr20, t0, 88 // 3857 6527 vldrepl.w vr21, t0, 92 // 1380 6528 vmul_vmadd_w vr27, vr3, vr21, vr20, vr8, vr4 6529 vmul_vmsub_w vr27, vr3, vr20, vr21, vr11, vr26 6530 vssrarni.h.w vr4, vr8, 12 // t28a 6531 vssrarni.h.w vr26, vr11, 12 // t19a 6532 6533 vldrepl.w vr20, t0, 96 // 995 6534 vldrepl.w vr21, t0, 100 // 3973 6535 vmul_vmadd_w vr2, vr28, vr21, vr20, vr8, vr3 6536 vmul_vmsub_w vr2, vr28, vr20, vr21, vr11, vr27 6537 vssrarni.h.w vr3, vr8, 12 // t27a 6538 vssrarni.h.w vr27, vr11, 12 // t20a 6539 6540 vldrepl.w vr20, t0, 104 // 3513 6541 vldrepl.w vr21, t0, 108 // 2106 6542 vmul_vmadd_w vr25, vr5, vr21, vr20, vr8, vr2 6543 vmul_vmsub_w vr25, vr5, vr20, vr21, vr11, vr28 6544 vssrarni.h.w vr2, vr8, 12 // t26a 6545 vssrarni.h.w vr28, vr11, 12 // t21a 6546 6547 vldrepl.w vr20, t0, 112 // 2440 -> 1220 6548 vldrepl.w vr21, t0, 116 // 3290 -> 1645 6549 vmul_vmadd_w vr6, vr24, vr21, vr20, vr8, vr5 6550 vmul_vmsub_w vr6, vr24, vr20, vr21, vr11, vr25 6551 vssrarni.h.w vr5, vr8, 12 // t25a 6552 vssrarni.h.w vr25, vr11, 12 // t22a 6553 6554 vldrepl.w vr20, t0, 120 // 4052 6555 vldrepl.w vr21, t0, 124 // 601 6556 vmul_vmadd_w vr29, vr1, vr21, vr20, vr8, vr6 6557 vmul_vmsub_w vr29, vr1, vr20, vr21, vr11, vr24 6558 vssrarni.h.w vr6, vr8, 12 // t24a 6559 vssrarni.h.w vr24, vr11, 12 // t23a 6560 6561 vsadd.h vr1, vr10, vr30 // t16 6562 vssub.h vr29, vr10, vr30 // t17 6563 vssub.h vr8, vr26, vr19 // t18 6564 vsadd.h vr31, vr26, vr19 // t19 6565 vsadd.h vr10, vr27, vr28 // t20 6566 vssub.h vr30, vr27, vr28 // t21 6567 vssub.h vr19, vr24, vr25 // t22 6568 vsadd.h vr26, vr24, vr25 // t23 6569 vsadd.h vr27, vr6, vr5 // t24 6570 vssub.h vr28, vr6, vr5 // t25 6571 vssub.h vr24, vr3, vr2 // t26 6572 vsadd.h vr25, vr3, vr2 // t27 6573 vsadd.h vr5, vr4, vr7 // t28 6574 vssub.h vr6, vr4, vr7 // t29 6575 vssub.h vr2, vr9, vr0 // t30 6576 vsadd.h vr3, vr9, vr0 // t31 6577 6578 vldrepl.w vr20, t0, 16 // 799 6579 vldrepl.w vr21, t0, 20 // 4017 6580 vmul_vmadd_w vr2, vr29, vr21, vr20, vr4, vr7 6581 vmul_vmsub_w vr2, vr29, vr20, vr21, vr11, vr0 6582 vssrarni.h.w vr7, vr4, 12 // t30a 6583 vssrarni.h.w vr0, vr11, 12 // t17a 6584 vmul_vmadd_w vr6, vr8, vr21, vr20, vr4, vr9 6585 vneg.w vr4, vr4 6586 vneg.w vr9, vr9 6587 vmul_vmsub_w vr6, vr8, vr20, vr21, vr11, vr2 6588 vssrarni.h.w vr9, vr4, 12 // t18a 6589 vssrarni.h.w vr2, vr11, 12 // t29a 6590 6591 vldrepl.w vr20, t0, 24 // 3406 -> 1703 6592 vldrepl.w vr21, t0, 28 // 2276 -> 1138 6593 vmul_vmadd_w vr24, vr30, vr21, vr20, vr4, vr29 6594 vmul_vmsub_w vr24, vr30, vr20, vr21, vr11, vr6 6595 vssrarni.h.w vr29, vr4, 12 // t26a 6596 vssrarni.h.w vr6, vr11, 12 // t21a 6597 6598 vmul_vmadd_w vr28, vr19, vr21, vr20, vr4, vr8 6599 vneg.w vr4, vr4 6600 vneg.w vr8, vr8 6601 vmul_vmsub_w vr28, vr19, vr20, vr21, vr11, vr24 6602 vssrarni.h.w vr8, vr4, 12 // t22a 6603 vssrarni.h.w vr24, vr11, 12 // t25a 6604 6605 vsadd.h vr4, vr1, vr31 // t16a 6606 vssub.h vr30, vr1, vr31 // t19a 6607 vsadd.h vr19, vr0, vr9 // t17 6608 vssub.h vr28, vr0, vr9 // t18 6609 vssub.h vr1, vr26, vr10 // t20a 6610 vsadd.h vr31, vr26, vr10 // t23a 6611 vssub.h vr0, vr8, vr6 // t21 6612 vsadd.h vr9, vr8, vr6 // t22 6613 vsadd.h vr10, vr27, vr25 // t24a 6614 vssub.h vr26, vr27, vr25 // t27a 6615 vsadd.h vr6, vr24, vr29 // t25 6616 vssub.h vr8, vr24, vr29 // t26 6617 vssub.h vr25, vr3, vr5 // t28a 
    vsadd.h vr27, vr3, vr5 // t31a
    vssub.h vr24, vr7, vr2 // t29
    vsadd.h vr29, vr7, vr2 // t30

    vldrepl.w vr20, t0, 8 // 1567
    vldrepl.w vr21, t0, 12 // 3784
    vmul_vmadd_w vr24, vr28, vr21, vr20, vr3, vr5
    vmul_vmsub_w vr24, vr28, vr20, vr21, vr11, vr2
    vssrarni.h.w vr5, vr3, 12 // t29a
    vssrarni.h.w vr2, vr11, 12 // t18a

    vmul_vmadd_w vr25, vr30, vr21, vr20, vr3, vr7
    vmul_vmsub_w vr25, vr30, vr20, vr21, vr11, vr24
    vssrarni.h.w vr7, vr3, 12 // t28
    vssrarni.h.w vr24, vr11, 12 // t19

    vmul_vmadd_w vr26, vr1, vr21, vr20, vr3, vr28
    vneg.w vr3, vr3
    vneg.w vr28, vr28
    vmul_vmsub_w vr26, vr1, vr20, vr21, vr11, vr25
    vssrarni.h.w vr28, vr3, 12 // t20
    vssrarni.h.w vr25, vr11, 12 // t27

    vmul_vmadd_w vr8, vr0, vr21, vr20, vr3, vr30
    vneg.w vr3, vr3
    vneg.w vr30, vr30
    vmul_vmsub_w vr8, vr0, vr20, vr21, vr11, vr1
    vssrarni.h.w vr30, vr3, 12 // t21a
    vssrarni.h.w vr1, vr11, 12 // t26a

    vsadd.h vr3, vr4, vr31 // t16
    vssub.h vr26, vr4, vr31 // t23
    vsadd.h vr0, vr19, vr9 // t17a
    vssub.h vr8, vr19, vr9 // t22a
    vsadd.h vr4, vr2, vr30 // t18
    vssub.h vr31, vr2, vr30 // t21
    vsadd.h vr9, vr24, vr28 // t19a
    vssub.h vr19, vr24, vr28 // t20a
    vssub.h vr2, vr27, vr10 // t24
    vsadd.h vr30, vr27, vr10 // t31
    vssub.h vr24, vr29, vr6 // t25a
    vsadd.h vr28, vr29, vr6 // t30a
    vssub.h vr10, vr5, vr1 // t26
    vsadd.h vr27, vr5, vr1 // t29
    vssub.h vr6, vr7, vr25 // t27a
    vsadd.h vr29, vr7, vr25 // t28a

    vldrepl.w vr20, t0, 0 // 2896
    vmul_vmsub_w vr6, vr19, vr20, vr20, vr1, vr5
    vmul_vmadd_w vr6, vr19, vr20, vr20, vr11, vr7
    vssrarni.h.w vr5, vr1, 12 // t20
    vssrarni.h.w vr7, vr11, 12 // t27

    vmul_vmsub_w vr10, vr31, vr20, vr20, vr1, vr25
    vmul_vmadd_w vr10, vr31, vr20, vr20, vr11, vr6
    vssrarni.h.w vr25, vr1, 12 // t21a
    vssrarni.h.w vr6, vr11, 12 // t26a

    vmul_vmsub_w vr24, vr8, vr20, vr20, vr1, vr19
    vmul_vmadd_w vr24, vr8, vr20, vr20, vr11, vr10
    vssrarni.h.w vr19, vr1, 12 // t22
    vssrarni.h.w vr10, vr11, 12 // t25

    vmul_vmsub_w vr2, vr26, vr20, vr20, vr1, vr31
    vmul_vmadd_w vr2, vr26, vr20, vr20, vr11, vr8
    vssrarni.h.w vr31, vr1, 12 // t23a
    vssrarni.h.w vr8, vr11, 12 // t24a

    // t31 t30a t29 t28a t27 t26a t25 t24a t23a t22 t21a t20 t19a t18 t17a t16
    // vr30 vr28 vr27 vr29 vr7 vr6 vr10 vr8 vr31 vr19 vr25 vr5 vr9 vr4 vr0 vr3

    vld_x8 \in2, 0, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18

    vsadd.h vr1, vr11, vr30 // c[0]
    vssub.h vr2, vr11, vr30 // c[31]
    vsadd.h vr24, vr12, vr28 // c[1]
    vssub.h vr26, vr12, vr28 // c[30]
    vsadd.h vr11, vr13, vr27 // c[2]
    vssub.h vr30, vr13, vr27 // c[29]
    vsadd.h vr12, vr14, vr29 // c[3]
    vssub.h vr28, vr14, vr29 // c[28]
    vsadd.h vr13, vr15, vr7 // c[4]
    vssub.h vr27, vr15, vr7 // c[27]
    vsadd.h vr14, vr16, vr6 // c[5]
    vssub.h vr29, vr16, vr6 // c[26]
    vsadd.h vr7, vr17, vr10 // c[6]
    vssub.h vr15, vr17, vr10 // c[25]
    vsadd.h vr6, vr18, vr8 // c[7]
    vssub.h vr16, vr18, vr8 // c[24]

.ifnb \transpose8x8
    LSX_TRANSPOSE8x8_H vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \
                       vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \
                       vr8, vr10, vr17, vr18, vr20, vr21, vr22, vr23
.endif

.ifnb \shift
.irp i, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6
    vsrari.h \i, \i, \shift
6717.endr 6718.endif 6719 6720 vst_x8 \in1, \vst_start0, 64, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6 6721 6722.ifnb \transpose8x8 6723 LSX_TRANSPOSE8x8_H vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2, \ 6724 vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2, \ 6725 vr8, vr10, vr17, vr18, vr20, vr21, vr22, vr23 6726.endif 6727 6728.ifnb \shift 6729.irp i, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 6730 vsrari.h \i, \i, \shift 6731.endr 6732.endif 6733 6734 vst_x8 \in1, \vst_start3, 64, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 6735 6736 vld_x8 \in2, 128, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 6737 6738 vsadd.h vr1, vr11, vr31 // c[8] 6739 vssub.h vr2, vr11, vr31 // c[23] 6740 vsadd.h vr24, vr12, vr19 // c[9] 6741 vssub.h vr26, vr12, vr19 // c[22] 6742 vsadd.h vr11, vr13, vr25 // c[10] 6743 vssub.h vr30, vr13, vr25 // c[21] 6744 vsadd.h vr12, vr14, vr5 // c[11] 6745 vssub.h vr28, vr14, vr5 // c[20] 6746 vsadd.h vr13, vr15, vr9 // c[12] 6747 vssub.h vr27, vr15, vr9 // c[19] 6748 vsadd.h vr14, vr16, vr4 // c[13] 6749 vssub.h vr29, vr16, vr4 // c[18] 6750 vsadd.h vr7, vr17, vr0 // c[14] 6751 vssub.h vr15, vr17, vr0 // c[17] 6752 vsadd.h vr6, vr18, vr3 // c[15] 6753 vssub.h vr16, vr18, vr3 // c[16] 6754 6755.ifnb \transpose8x8 6756 LSX_TRANSPOSE8x8_H vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \ 6757 vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \ 6758 vr8, vr10, vr17, vr18, vr20, vr21, vr22, vr23 6759.endif 6760 6761.ifnb \shift 6762.irp i, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6 6763 vsrari.h \i, \i, \shift 6764.endr 6765.endif 6766 6767 vst_x8 \in1, \vst_start1, 64, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6 6768 6769.ifnb \transpose8x8 6770 LSX_TRANSPOSE8x8_H vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2, \ 6771 vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2, \ 6772 vr8, vr10, vr17, vr18, vr20, vr21, vr22, vr23 6773.endif 6774 6775.ifnb \shift 6776.irp i, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 6777 vsrari.h \i, \i, \shift 6778.endr 6779.endif 6780 6781 vst_x8 \in1, \vst_start2, 64, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 6782.endm 6783 6784function inv_txfm_add_dct_dct_32x32_8bpc_lsx 6785 bnez a3, .NO_HAS_DCONLY_32x32 6786 6787 ld.h t2, a2, 0 // dc 6788 vldi vr0, 0x8b5 // 181 6789 vreplgr2vr.w vr1, t2 6790 vldi vr20, 0x880 // 128 6791 vmul.w vr2, vr0, vr1 // dc * 181 6792 st.h zero, a2, 0 6793 add.d t0, a0, a1 6794 vsrari.w vr2, vr2, 8 // (dc * 181 + 128) >> 8 6795 vld vr3, t0, 16 6796 vsrari.w vr2, vr2, 2 // (dc + rnd) >> shift 6797 vld vr1, a0, 16 6798 vmadd.w vr20, vr2, vr0 6799 vld vr2, t0, 0 6800 vssrarni.h.w vr20, vr20, 12 6801 vld vr0, a0, 0 6802 6803 vsllwil.hu.bu vr4, vr0, 0 6804 vsllwil.hu.bu vr5, vr1, 0 6805 vsllwil.hu.bu vr6, vr2, 0 6806 vsllwil.hu.bu vr7, vr3, 0 6807 vexth.hu.bu vr0, vr0 6808 vexth.hu.bu vr1, vr1 6809 vexth.hu.bu vr2, vr2 6810 vexth.hu.bu vr3, vr3 6811 vadd.h vr8, vr4, vr20 6812 vadd.h vr9, vr0, vr20 6813 vadd.h vr10, vr5, vr20 6814 vadd.h vr11, vr1, vr20 6815 vadd.h vr12, vr6, vr20 6816 vadd.h vr13, vr2, vr20 6817 vadd.h vr14, vr7, vr20 6818 vadd.h vr15, vr3, vr20 6819 vssrani.bu.h vr9, vr8, 0 6820 vssrani.bu.h vr11, vr10, 0 6821 vssrani.bu.h vr13, vr12, 0 6822 vssrani.bu.h vr15, vr14, 0 6823 vst vr9, a0, 0 6824 vst vr11, a0, 16 6825 vst vr13, t0, 0 6826 vst vr15, t0, 16 6827 6828.rept 15 6829 alsl.d a0, a1, a0, 1 6830 add.d t0, a0, a1 6831 6832 vld vr0, a0, 0 6833 vld vr1, a0, 16 6834 vld vr2, t0, 0 6835 vld vr3, t0, 16 6836 vsllwil.hu.bu vr4, vr0, 0 6837 vsllwil.hu.bu vr5, vr1, 0 6838 vsllwil.hu.bu vr6, vr2, 0 6839 
vsllwil.hu.bu vr7, vr3, 0 6840 vexth.hu.bu vr0, vr0 6841 vexth.hu.bu vr1, vr1 6842 vexth.hu.bu vr2, vr2 6843 vexth.hu.bu vr3, vr3 6844 vadd.h vr8, vr4, vr20 6845 vadd.h vr9, vr0, vr20 6846 vadd.h vr10, vr5, vr20 6847 vadd.h vr11, vr1, vr20 6848 vadd.h vr12, vr6, vr20 6849 vadd.h vr13, vr2, vr20 6850 vadd.h vr14, vr7, vr20 6851 vadd.h vr15, vr3, vr20 6852 vssrani.bu.h vr9, vr8, 0 6853 vssrani.bu.h vr11, vr10, 0 6854 vssrani.bu.h vr13, vr12, 0 6855 vssrani.bu.h vr15, vr14, 0 6856 vst vr9, a0, 0 6857 vst vr11, a0, 16 6858 vst vr13, t0, 0 6859 vst vr15, t0, 16 6860.endr 6861 6862 b .DCT_DCT_32X32_END 6863.NO_HAS_DCONLY_32x32: 6864 6865 malloc_space 2560 // 32*32*2+512 6866 6867 addi.d t1, sp, 64 6868 addi.d t2, a2, 0 6869 addi.d t3, sp, 1024 6870 addi.d t3, t3, 1024 6871 addi.d t3, t3, 64 6872 6873 vld_x16 t2, 0, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 6874 vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 6875 6876 dct_8x16_core_lsx 6877 6878 vst_x16 t3, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ 6879 vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 6880 6881 vld_x16 t2, 64, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 6882 vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 6883 6884 dct_8x32_core_lsx t1, t3, 0, 16, 32, 48, transpose8x8, 2 6885 6886.rept 3 6887 addi.d t2, t2, 16 6888 addi.d t1, t1, 512 6889 6890 vld_x16 t2, 0, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 6891 vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 6892 6893 dct_8x16_core_lsx 6894 6895 vst_x16 t3, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ 6896 vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 6897 6898 vld_x16 t2, 64, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 6899 vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 6900 6901 dct_8x32_core_lsx t1, t3, 0, 16, 32, 48, transpose8x8, 2 6902.endr 6903 6904 vreplgr2vr.h vr31, zero 6905.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512, 528, 544, 560, 576, 592, 608, 624, 640, 656, 672, 688, 704, 720, 736, 752, 768, 784, 800, 816, 832, 848, 864, 880, 896, 912, 928, 944, 960, 976, 992, 1008, 1024, 1040, 1056, 1072, 1088, 1104, 1120, 1136, 1152, 1168, 1184, 1200, 1216, 1232, 1248, 1264, 1280, 1296, 1312, 1328, 1344, 1360, 1376, 1392, 1408, 1424, 1440, 1456, 1472, 1488, 1504, 1520, 1536, 1552, 1568, 1584, 1600, 1616, 1632, 1648, 1664, 1680, 1696, 1712, 1728, 1744, 1760, 1776, 1792, 1808, 1824, 1840, 1856, 1872, 1888, 1904, 1920, 1936, 1952, 1968, 1984, 2000, 2016, 2032 6906 vst vr31, a2, \i 6907.endr 6908 6909 addi.d t2, sp, 64 6910 addi.d t1, sp, 64 6911 6912 vld_x16 t2, 0, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 6913 vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 6914 6915 dct_8x16_core_lsx 6916 6917 vst_x16 t3, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ 6918 vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 6919 6920 vld_x16 t2, 64, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 6921 vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 6922 6923 dct_8x32_core_lsx t1, t3, 0, 512, 1024, 1536, , 4 6924 6925.rept 3 6926 addi.d t2, t2, 16 6927 addi.d t1, t1, 16 6928 6929 vld_x16 t2, 0, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 6930 vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 6931 6932 dct_8x16_core_lsx 6933 6934 vst_x16 t3, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ 6935 vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 6936 6937 vld_x16 t2, 64, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 6938 vr19, vr24, vr25, 
vr26, vr27, vr28, vr29, vr30 6939 6940 dct_8x32_core_lsx t1, t3, 0, 512, 1024, 1536, , 4 6941.endr 6942 6943 addi.d t2, sp, 64 6944 6945.rept 16 6946 add.d t0, a0, a1 6947 vld vr0, a0, 0 6948 vld vr1, a0, 16 6949 vld vr2, t0, 0 6950 vld vr3, t0, 16 6951 vsllwil.hu.bu vr4, vr0, 0 6952 vsllwil.hu.bu vr5, vr1, 0 6953 vsllwil.hu.bu vr6, vr2, 0 6954 vsllwil.hu.bu vr7, vr3, 0 6955 vexth.hu.bu vr0, vr0 6956 vexth.hu.bu vr1, vr1 6957 vexth.hu.bu vr2, vr2 6958 vexth.hu.bu vr3, vr3 6959 vld_x8 t2, 0, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 6960 vadd.h vr8, vr4, vr8 6961 vadd.h vr9, vr0, vr9 6962 vadd.h vr10, vr5, vr10 6963 vadd.h vr11, vr1, vr11 6964 vadd.h vr12, vr6, vr12 6965 vadd.h vr13, vr2, vr13 6966 vadd.h vr14, vr7, vr14 6967 vadd.h vr15, vr3, vr15 6968 vssrani.bu.h vr9, vr8, 0 6969 vssrani.bu.h vr11, vr10, 0 6970 vssrani.bu.h vr13, vr12, 0 6971 vssrani.bu.h vr15, vr14, 0 6972 vst vr9, a0, 0 6973 vst vr11, a0, 16 6974 vst vr13, t0, 0 6975 vst vr15, t0, 16 6976 6977 alsl.d a0, a1, a0, 1 6978 addi.d t2, t2, 128 6979.endr 6980 6981 free_space 2560 // 32*32*2+512 6982 6983.DCT_DCT_32X32_END: 6984endfunc 6985 6986.macro dct_8x8_tx64_core_lsx in0, in1, in2, in3, in4, in5, in6, in7, \ 6987 out0, out1, out2, out3, out4, out5, out6, out7 6988 6989 // in0 in1 in2 in3 6990 // dct4 in0 in2 6991 la.local t0, idct_coeffs 6992 6993 vldrepl.w vr20, t0, 8 // 1567 6994 vldrepl.w vr21, t0, 12 // 3784 6995 vsllwil.w.h vr22, \in2, 0 6996 vexth.w.h vr23, \in2 6997 vmul.w vr8, vr22, vr20 6998 vmul.w vr10, vr23, vr20 6999 vmul.w \in2, vr22, vr21 7000 vmul.w vr9, vr23, vr21 7001 vssrarni.h.w vr10, vr8, 12 // t2 7002 vssrarni.h.w vr9, \in2, 12 // t3 7003 7004 vldrepl.w vr20, t0, 0 // 2896 7005 vsllwil.w.h vr22, \in0, 0 7006 vexth.w.h vr23, \in0 7007 vmul.w vr8, vr22, vr20 7008 vmul.w \in2, vr23, vr20 7009 vssrarni.h.w \in2, vr8, 12 7010 7011 vsadd.h vr8, \in2, vr9 // c[0] 7012 vssub.h vr9, \in2, vr9 // c[3] 7013 vsadd.h \in0, \in2, vr10 // c[1] 7014 vssub.h vr10, \in2, vr10 // c[2] 7015 7016 // inv_dct8_1d_internal_c tx64 7017 // in1 in3 7018 vldrepl.w vr20, t0, 16 // 799 7019 vldrepl.w vr21, t0, 20 // 4017 7020 7021 vsllwil.w.h vr22, \in1, 0 7022 vexth.w.h vr23, \in1 7023 vmul.w \in2, vr22, vr21 7024 vmul.w \in4, vr23, vr21 7025 vmul.w \in1, vr22, vr20 7026 vmul.w \in6, vr23, vr20 7027 vssrarni.h.w \in4, \in2, 12 // t7a 7028 vssrarni.h.w \in6, \in1, 12 // t4a 7029 7030 vldrepl.w vr20, t0, 24 // 3406 7031 vldrepl.w vr21, t0, 28 // 2276 7032 7033 vsllwil.w.h vr22, \in3, 0 7034 vexth.w.h vr23, \in3 7035 vneg.w vr21, vr21 7036 vmul.w \in2, vr22, vr20 7037 vmul.w \in1, vr23, vr20 7038 vmul.w \in3, vr22, vr21 7039 vmul.w \in7, vr23, vr21 7040 vssrarni.h.w \in1, \in2, 12 // t6a 7041 vssrarni.h.w \in7, \in3, 12 // t5a 7042 7043 vsadd.h \in3, \in6, \in7 // t4 7044 vssub.h \in6, \in6, \in7 // t5a 7045 vsadd.h \in5, \in4, \in1 // t7 7046 vssub.h \in4, \in4, \in1 // t6a 7047 7048 vldrepl.w vr20, t0, 0 // 2896 7049 vmul_vmadd_w \in4, \in6, vr20, vr20, vr21, \in1 7050 vmul_vmsub_w \in4, \in6, vr20, vr20, \in2, \in7 7051 vssrarni.h.w \in1, vr21, 12 // t6 7052 vssrarni.h.w \in7, \in2, 12 // t5 7053 7054 vsadd.h \out0, vr8, \in5 // c[0] 7055 vssub.h \out7, vr8, \in5 // c[7] 7056 vsadd.h \out1, \in0, \in1 // c[1] 7057 vssub.h \out6, \in0, \in1 // c[6] 7058 vsadd.h \out2, vr10, \in7 // c[2] 7059 vssub.h \out5, vr10, \in7 // c[5] 7060 vsadd.h \out3, vr9, \in3 // c[3] 7061 vssub.h \out4, vr9, \in3 // c[4] 7062.endm 7063 7064.macro dct_8x16_tx64_core_lsx 7065 dct_8x8_tx64_core_lsx vr0, vr2, vr4, vr6, vr19, vr25, vr27, vr29, vr11, 
\ 7066 vr12, vr13, vr14, vr15, vr16, vr17, vr18 7067 7068 // in1 in3 in5 in7 in9 in11 in13 in15 7069 // vr1 vr3 vr5 vr7 vr24 vr26 vr28 vr30 7070 la.local t0, idct_coeffs 7071 7072 vldrepl.w vr20, t0, 32 // 401 7073 vldrepl.w vr21, t0, 36 // 4076 7074 vsllwil.w.h vr22, vr1, 0 7075 vexth.w.h vr23, vr1 7076 vmul.w vr0, vr22, vr21 7077 vmul.w vr10, vr23, vr21 7078 vmul.w vr1, vr22, vr20 7079 vmul.w vr29, vr23, vr20 7080 vssrarni.h.w vr10, vr0, 12 // t15a 7081 vssrarni.h.w vr29, vr1, 12 // t8a 7082 7083 vldrepl.w vr20, t0, 40 // 3166 -> 1583 7084 vldrepl.w vr21, t0, 44 // 2598 -> 1299 7085 vsllwil.w.h vr22, vr7, 0 7086 vexth.w.h vr23, vr7 7087 vneg.w vr21, vr21 7088 vmul.w vr0, vr22, vr20 7089 vmul.w vr30, vr23, vr20 7090 vmul.w vr7, vr22, vr21 7091 vmul.w vr31, vr23, vr21 7092 vssrarni.h.w vr30, vr0, 12 // t14a 7093 vssrarni.h.w vr31, vr7, 12 // t9a 7094 7095 vldrepl.w vr20, t0, 48 // 1931 7096 vldrepl.w vr21, t0, 52 // 3612 7097 vsllwil.w.h vr22, vr5, 0 7098 vexth.w.h vr23, vr5 7099 vmul.w vr0, vr22, vr21 7100 vmul.w vr24, vr23, vr21 7101 vmul.w vr5, vr22, vr20 7102 vmul.w vr25, vr23, vr20 7103 vssrarni.h.w vr24, vr0, 12 // t13a 7104 vssrarni.h.w vr25, vr5, 12 // t10a 7105 7106 vldrepl.w vr20, t0, 56 // 3920 7107 vldrepl.w vr21, t0, 60 // 1189 7108 vsllwil.w.h vr22, vr3, 0 7109 vexth.w.h vr23, vr3 7110 vneg.w vr21, vr21 7111 vmul.w vr0, vr22, vr20 7112 vmul.w vr26, vr23, vr20 7113 vmul.w vr3, vr22, vr21 7114 vmul.w vr27, vr23, vr21 7115 vssrarni.h.w vr26, vr0, 12 // t12a 7116 vssrarni.h.w vr27, vr3, 12 // t11a 7117 7118 // vr22 vr23 vr30 vr31 vr24 vr25 vr26 vr27 7119 vsadd.h vr28, vr29, vr31 // t8 7120 vssub.h vr19, vr29, vr31 // t9 7121 vssub.h vr29, vr27, vr25 // t10 7122 vsadd.h vr9, vr27, vr25 // t11 7123 vsadd.h vr31, vr26, vr24 // t12 7124 vssub.h vr25, vr26, vr24 // t13 7125 vssub.h vr27, vr10, vr30 // t14 7126 vsadd.h vr24, vr10, vr30 // t15 7127 7128 vldrepl.w vr20, t0, 8 // 1567 7129 vldrepl.w vr21, t0, 12 // 3784 7130 vmul_vmadd_w vr27, vr19, vr21, vr20, vr0, vr26 7131 vmul_vmsub_w vr27, vr19, vr20, vr21, vr1, vr30 7132 vssrarni.h.w vr26, vr0, 12 // t14a 7133 vssrarni.h.w vr30, vr1, 12 // t9a 7134 7135 vmul_vmadd_w vr25, vr29, vr21, vr20, vr0, vr19 7136 vneg.w vr0, vr0 7137 vneg.w vr19, vr19 7138 vmul_vmsub_w vr25, vr29, vr20, vr21, vr1, vr27 7139 vssrarni.h.w vr19, vr0, 12 // t10a 7140 vssrarni.h.w vr27, vr1, 12 // t13a 7141 7142 vsadd.h vr25, vr28, vr9 // t8a 7143 vssub.h vr29, vr28, vr9 // t11a 7144 vssub.h vr28, vr24, vr31 // t12a 7145 vsadd.h vr10, vr24, vr31 // t15a 7146 vsadd.h vr9, vr30, vr19 // t9 7147 vssub.h vr31, vr30, vr19 // t10 7148 vssub.h vr30, vr26, vr27 // t13 7149 vsadd.h vr24, vr26, vr27 // t14 7150 7151 vldrepl.w vr20, t0, 0 // 2896 7152 vmul_vmadd_w vr30, vr31, vr20, vr20, vr0, vr26 7153 vmul_vmsub_w vr30, vr31, vr20, vr20, vr1, vr27 7154 vssrarni.h.w vr26, vr0, 12 // t13a 7155 vssrarni.h.w vr27, vr1, 12 // t10a 7156 7157 vmul_vmadd_w vr28, vr29, vr20, vr20, vr0, vr31 7158 vmul_vmsub_w vr28, vr29, vr20, vr20, vr1, vr30 7159 vssrarni.h.w vr31, vr0, 12 // t12 7160 vssrarni.h.w vr30, vr1, 12 // t11 7161 7162 // vr11 vr12 ... 
vr18 7163 vsadd.h vr28, vr14, vr31 // c[3] 7164 vssub.h vr29, vr14, vr31 // c[12] 7165 vsadd.h vr20, vr15, vr30 // c[4] 7166 vssub.h vr21, vr15, vr30 // c[11] 7167 vsadd.h vr14, vr16, vr27 // c[5] 7168 vssub.h vr23, vr16, vr27 // c[10] 7169 vsadd.h vr15, vr17, vr9 // c[6] 7170 vssub.h vr30, vr17, vr9 // c[9] 7171 vsadd.h vr16, vr18, vr25 // c[7] 7172 vssub.h vr27, vr18, vr25 // c[8] 7173 vsadd.h vr17, vr13, vr26 // c[2] 7174 vssub.h vr26, vr13, vr26 // c[13] 7175 vsadd.h vr18, vr12, vr24 // c[1] 7176 vssub.h vr25, vr12, vr24 // c[14] 7177 vsadd.h vr22, vr11, vr10 // c[0] 7178 vssub.h vr24, vr11, vr10 // c[15] 7179.endm // dct_8x16_tx64_core_lsx 7180 7181.macro vmul_vssrarni_hw in0, in1, in2, tmp0, tmp1, out0, out1 7182 vsllwil.w.h vr22, \in0, 0 7183 vexth.w.h vr23, \in0 7184 vmul.w \tmp0, vr22, \in1 7185 vmul.w \out0, vr23, \in1 7186 vmul.w \tmp1, vr22, \in2 7187 vmul.w \out1, vr23, \in2 7188 vssrarni.h.w \out0, \tmp0, 12 7189 vssrarni.h.w \out1, \tmp1, 12 7190.endm 7191 7192const idct64_coeffs, align=4 7193 .word 101, 4095, 2967, -2824 7194 .word 1660, 3745, 3822, -1474 7195 .word 4076, 401, 4017, 799 7196 7197 .word 4036, -700, 2359, 3349 7198 .word 3461, -2191, 897, 3996 7199 .word -3166, -2598, -799, -4017 7200 7201 .word 501, 4065, 3229, -2520 7202 .word 2019, 3564, 3948, -1092 7203 .word 3612, 1931, 2276, 3406 7204 7205 .word 4085, -301, 2675, 3102 7206 .word 3659, -1842, 1285, 3889 7207 .word -3920, -1189, -3406, -2276 7208endconst 7209 7210// in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a 7211// in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a 7212// in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a 7213// in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a 7214 7215.macro dct64_step1_lsx 7216 7217 vldrepl.w vr20, t0, 0 // 101 7218 vldrepl.w vr21, t0, 4 // 4095 7219 vmul_vssrarni_hw vr0, vr20, vr21, vr16, vr0, vr8, vr9 // vr8 t32a vr9 t63a 7220 7221 vldrepl.w vr20, t0, 8 // 2967 7222 vldrepl.w vr21, t0, 12 // -2824 7223 vmul_vssrarni_hw vr1, vr20, vr21, vr16, vr1, vr10, vr11 // vr10 t62a vr11 t33a 7224 7225 vldrepl.w vr20, t0, 16 // 1660 7226 vldrepl.w vr21, t0, 20 // 3745 7227 vmul_vssrarni_hw vr2, vr20, vr21, vr16, vr2, vr12, vr13 // vr12 t34a vr13 t61a 7228 7229 vldrepl.w vr20, t0, 24 // 3822 7230 vldrepl.w vr21, t0, 28 // -1474 7231 vmul_vssrarni_hw vr3, vr20, vr21, vr16, vr3, vr14, vr15 // vr14 t60a vr15 t35a 7232 7233 vsadd.h vr0, vr8, vr11 // t32 7234 vssub.h vr1, vr8, vr11 // t33 7235 vssub.h vr2, vr15, vr12 // t34 7236 vsadd.h vr3, vr15, vr12 // t35 7237 vsadd.h vr4, vr14, vr13 // t60 7238 vssub.h vr5, vr14, vr13 // t61 7239 vssub.h vr6, vr9, vr10 // t62 7240 vsadd.h vr7, vr9, vr10 // t63 7241 7242 vldrepl.w vr20, t0, 32 // 4076 7243 vldrepl.w vr21, t0, 36 // 401 7244 vmul_vmadd_w vr6, vr1, vr20, vr21, vr9, vr10 7245 vmul_vmsub_w vr6, vr1, vr21, vr20, vr13, vr11 7246 vssrarni.h.w vr10, vr9, 12 // t62a 7247 vssrarni.h.w vr11, vr13, 12 // t33a 7248 7249 vmul_vmadd_w vr5, vr2, vr20, vr21, vr9, vr1 7250 vmul_vmsub_w vr5, vr2, vr21, vr20, vr13, vr6 7251 vneg.w vr9, vr9 7252 vneg.w vr1, vr1 7253 vssrarni.h.w vr6, vr13, 12 // t61a 7254 vssrarni.h.w vr1, vr9, 12 // t34a 7255 7256 vsadd.h vr2, vr0, vr3 // t32a 7257 vssub.h vr5, vr0, vr3 // t35a 7258 vsadd.h vr9, vr11, vr1 // t33 7259 vssub.h vr13, vr11, vr1 // t34 7260 vssub.h vr0, vr7, vr4 // t60a 7261 vsadd.h vr3, vr7, vr4 // t63a 7262 vssub.h vr1, vr10, vr6 // t61 7263 vsadd.h vr11, vr10, vr6 // t62 7264 7265 vldrepl.w vr20, t0, 40 // 4017 7266 vldrepl.w vr21, t0, 44 // 799 7267 7268 vmul_vmadd_w vr1, vr13, vr20, vr21, vr8, vr4 7269 vmul_vmsub_w 
vr1, vr13, vr21, vr20, vr12, vr7 7270 vssrarni.h.w vr4, vr8, 12 // t61a 7271 vssrarni.h.w vr7, vr12, 12 // t34a 7272 7273 vmul_vmadd_w vr0, vr5, vr20, vr21, vr8, vr6 7274 vmul_vmsub_w vr0, vr5, vr21, vr20, vr12, vr10 7275 vssrarni.h.w vr6, vr8, 12 // t60 7276 vssrarni.h.w vr10, vr12, 12 // t35 7277 7278 vst_x8 t6, 0, 16, vr2, vr9, vr7, vr10, vr6, vr4, vr11, vr3 7279.endm // dct64_step1 7280 7281 // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a 7282 // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a 7283 // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a 7284 // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a 7285.macro dct64_step2_lsx 7286 vld vr0, t5, 0 // t32a 7287 vld vr2, t4, 0 // t63a 7288 vld vr3, t5, 16*8 // t56a 7289 vld vr1, t4, 16*8 // t39a 7290 vld vr4, t5, 16*16 // t40a 7291 vld vr6, t4, 16*16 // t55a 7292 vld vr7, t5, 16*24 // t48a 7293 vld vr5, t4, 16*24 // t47a 7294 7295 vsadd.h vr8, vr0, vr1 // t32 7296 vssub.h vr9, vr0, vr1 // t39 7297 vsadd.h vr10, vr2, vr3 // t63 7298 vssub.h vr11, vr2, vr3 // t56 7299 vssub.h vr12, vr5, vr4 // t40 7300 vsadd.h vr13, vr5, vr4 // t47 7301 vsadd.h vr14, vr7, vr6 // t48 7302 vssub.h vr15, vr7, vr6 // t55 7303 7304 vldrepl.w vr20, t0, 8 // 1567 7305 vldrepl.w vr21, t0, 12 // 3784 7306 vmul_vmadd_w vr11, vr9, vr21, vr20, vr0, vr2 7307 vmul_vmsub_w vr11, vr9, vr20, vr21, vr1, vr3 7308 vssrarni.h.w vr2, vr0, 12 // t56a 7309 vssrarni.h.w vr3, vr1, 12 // t39a 7310 7311 vmul_vmadd_w vr15, vr12, vr21, vr20, vr0, vr4 7312 vmul_vmsub_w vr15, vr12, vr20, vr21, vr1, vr5 7313 vneg.w vr0, vr0 7314 vneg.w vr4, vr4 7315 vssrarni.h.w vr5, vr1, 12 // t55a 7316 vssrarni.h.w vr4, vr0, 12 // t40a 7317 7318 vsadd.h vr9, vr8, vr13 // t32a 7319 vssub.h vr11, vr8, vr13 // t47a 7320 vsadd.h vr6, vr3, vr4 // t39 7321 vssub.h vr7, vr3, vr4 // t40 7322 vssub.h vr12, vr10, vr14 // t48a 7323 vsadd.h vr15, vr10, vr14 // t63a 7324 vssub.h vr0, vr2, vr5 // t55 7325 vsadd.h vr1, vr2, vr5 // t56 7326 7327 vldrepl.w vr20, t0, 0 // 2896 7328 vmul_vmsub_w vr0, vr7, vr20, vr20, vr8, vr13 7329 vmul_vmadd_w vr0, vr7, vr20, vr20, vr3, vr4 7330 vssrarni.h.w vr13, vr8, 12 // t40a 7331 vssrarni.h.w vr4, vr3, 12 // t55a 7332 vmul_vmsub_w vr12, vr11, vr20, vr20, vr8, vr10 7333 vmul_vmadd_w vr12, vr11, vr20, vr20, vr3, vr14 7334 vssrarni.h.w vr10, vr8, 12 // t47 7335 vssrarni.h.w vr14, vr3, 12 // t48 7336 7337 // t32a t39 t40a t47 t48 t55a t56 t63a 7338 // vr9 vr6 vr13 vr10 vr14 vr4 vr1 vr15 7339 vst vr9, t5, 0 // t32a 7340 vst vr6, t4, 0 // t39 7341 vst vr13, t5, 16*8 // t40a 7342 vst vr10, t4, 16*8 // t47 7343 vst vr14, t5, 16*16 // t48 7344 vst vr4, t4, 16*16 // t55a 7345 vst vr1, t5, 16*24 // t56 7346 vst vr15, t4, 16*24 // t63a 7347.endm // dct64_step2_lsx 7348 7349.macro dct64_step3_lsx 7350 // t0 t1 t2 t3 t4 t5 t6 t7 7351 vld_x8 t3, 0, 16, vr2, vr3, vr7, vr8, vr11, vr12, vr16, vr17 7352 7353 vld vr9, t5, 16*24 // t56 7354 vld vr6, t5, 16*24+16 // t57a 7355 vld vr13, t5, 16*24+32 // t58 7356 vld vr10, t5, 16*24+48 // t59a 7357 vld vr14, t4, 16*24-48 // t60 7358 vld vr4, t4, 16*24-32 // t61a 7359 vld vr1, t4, 16*24-16 // t62 7360 vld vr15, t4, 16*24 // t63a 7361 7362 vsadd.h vr20, vr2, vr15 // c[0] 7363 vssub.h vr21, vr2, vr15 // c[63] 7364 vsadd.h vr22, vr3, vr1 // c[1] 7365 vssub.h vr23, vr3, vr1 // c[62] 7366 vsadd.h vr24, vr7, vr4 // c[2] 7367 vssub.h vr25, vr7, vr4 // c[61] 7368 vsadd.h vr26, vr8, vr14 // c[3] 7369 vssub.h vr27, vr8, vr14 // c[60] 7370 7371 vsadd.h vr28, vr11, vr10 // c[4] 7372 vssub.h vr29, vr11, vr10 // c[59] 7373 vsadd.h vr30, vr12, vr13 // c[5] 7374 vssub.h vr31, 
vr12, vr13 // c[58] 7375 vsadd.h vr2, vr16, vr6 // c[6] 7376 vssub.h vr15, vr16, vr6 // c[57] 7377 vsadd.h vr1, vr17, vr9 // c[7] 7378 vssub.h vr3, vr17, vr9 // c[56] 7379.endm // dct64_step3_lsx 7380 7381.macro dct64_step4_lsx transpose8x8, shift, start0, stride0, start1, stride1 7382 7383 dct64_step3_lsx 7384 7385.ifnb \transpose8x8 7386 LSX_TRANSPOSE8x8_H vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1, \ 7387 vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1, \ 7388 vr4, vr7, vr8, vr14, vr10, vr11, vr12, vr13 7389 7390 LSX_TRANSPOSE8x8_H vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21, \ 7391 vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21, \ 7392 vr4, vr7, vr8, vr14, vr10, vr11, vr12, vr13 7393.endif 7394 7395.ifnb \shift 7396.irp i, vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1, \ 7397 vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21 7398 vsrari.h \i, \i, \shift 7399.endr 7400.endif 7401 7402 vst_x8 t7, \start0, \stride0, vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1 7403 7404 vst_x8 t7, \start1, \stride1, vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21 7405 7406.endm // dct64_step4_lsx 7407 7408.macro dct64_step5_lsx in0, in1, in2, in3, in4, in5, in6, in7 7409 7410 fld.d f4, t0, 0 7411 fldx.d f5, t0, a1 7412 fld.d f6, t6, 0 7413 fldx.d f7, t6, a1 7414 alsl.d t0, a1, t0, 2 7415 alsl.d t6, a1, t6, 2 7416 fld.d f8, t0, 0 7417 fldx.d f9, t0, a1 7418 fld.d f10, t6, 0 7419 fldx.d f11, t6, a1 7420 7421.irp i, vr4, vr5, vr6, vr7, vr8, vr9, vr10, vr11 7422 vsllwil.hu.bu \i, \i, 0 7423.endr 7424 7425 vsrari.h vr20, \in0, 4 7426 vsrari.h vr22, \in1, 4 7427 vsrari.h vr24, \in2, 4 7428 vsrari.h vr26, \in3, 4 7429 vsrari.h vr28, \in4, 4 7430 vsrari.h vr30, \in5, 4 7431 vsrari.h vr2, \in6, 4 7432 vsrari.h vr1, \in7, 4 7433 7434 vadd.h vr4, vr4, vr20 7435 vadd.h vr5, vr5, vr22 7436 vadd.h vr6, vr6, vr24 7437 vadd.h vr7, vr7, vr26 7438 vadd.h vr8, vr8, vr28 7439 vadd.h vr9, vr9, vr30 7440 vadd.h vr10, vr10, vr2 7441 vadd.h vr11, vr11, vr1 7442 7443 vssrani.bu.h vr5, vr4, 0 7444 vssrani.bu.h vr7, vr6, 0 7445 vssrani.bu.h vr9, vr8, 0 7446 vssrani.bu.h vr11, vr10, 0 7447 7448 vstelm.d vr5, t1, 0, 0 7449 vstelm.d vr5, t2, 0, 1 7450 7451 alsl.d t1, a1, t1, 1 7452 alsl.d t2, a1, t2, 1 7453 vstelm.d vr7, t1, 0, 0 7454 vstelm.d vr7, t2, 0, 1 7455 7456 alsl.d t1, a1, t1, 1 7457 alsl.d t2, a1, t2, 1 7458 vstelm.d vr9, t1, 0, 0 7459 vstelm.d vr9, t2, 0, 1 7460 7461 alsl.d t1, a1, t1, 1 7462 alsl.d t2, a1, t2, 1 7463 vstelm.d vr11, t1, 0, 0 7464 vstelm.d vr11, t2, 0, 1 7465.endm // dct64_step5_lsx 7466 7467.macro dct_8x32_tx64_new_lsx vld_loc0, stride0, vld_loc1, stride1 7468 vld_x8 t2, \vld_loc0, \stride0, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 7469 7470 dct_8x16_tx64_core_lsx 7471 7472 vst_x16 t3, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ 7473 vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 7474 7475 vld_x8 t2, \vld_loc1, \stride1, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 7476 7477 la.local t0, idct_coeffs 7478 7479 vldrepl.w vr20, t0, 64 // 201 7480 vldrepl.w vr21, t0, 68 // 4091 7481 vsllwil.w.h vr22, vr0, 0 7482 vexth.w.h vr23, vr0 7483 vmul.w vr8, vr22, vr21 7484 vmul.w vr9, vr23, vr21 7485 vmul.w vr0, vr22, vr20 7486 vmul.w vr10, vr23, vr20 7487 vssrarni.h.w vr9, vr8, 12 // t31a 7488 vssrarni.h.w vr10, vr0, 12 // t16a 7489 7490 vldrepl.w vr20, t0, 72 // 3035 7491 vldrepl.w vr21, t0, 76 // 2751 7492 vsllwil.w.h vr22, vr7, 0 7493 vexth.w.h vr23, vr7 7494 vneg.w vr21, vr21 7495 vmul.w vr8, vr22, vr20 7496 vmul.w vr0, vr23, vr20 7497 vmul.w vr7, vr22, vr21 7498 vmul.w vr30, vr23, vr21 7499 vssrarni.h.w vr0, vr8, 12 
// t30a 7500 vssrarni.h.w vr30, vr7, 12 // t17a 7501 7502 vldrepl.w vr20, t0, 80 // 1751 7503 vldrepl.w vr21, t0, 84 // 3703 7504 vsllwil.w.h vr22, vr4, 0 7505 vexth.w.h vr23, vr4 7506 vmul.w vr8, vr22, vr21 7507 vmul.w vr7, vr23, vr21 7508 vmul.w vr4, vr22, vr20 7509 vmul.w vr19, vr23, vr20 7510 vssrarni.h.w vr7, vr8, 12 // t29a 7511 vssrarni.h.w vr19, vr4, 12 // t18a 7512 7513 vldrepl.w vr20, t0, 88 // 3857 7514 vldrepl.w vr21, t0, 92 // 1380 7515 vsllwil.w.h vr22, vr3, 0 7516 vexth.w.h vr23, vr3 7517 vneg.w vr21, vr21 7518 vmul.w vr8, vr22, vr20 7519 vmul.w vr4, vr23, vr20 7520 vmul.w vr3, vr22, vr21 7521 vmul.w vr26, vr23, vr21 7522 vssrarni.h.w vr4, vr8, 12 // t28a 7523 vssrarni.h.w vr26, vr3, 12 // t19a 7524 7525 vldrepl.w vr20, t0, 96 // 995 7526 vldrepl.w vr21, t0, 100 // 3973 7527 vsllwil.w.h vr22, vr2, 0 7528 vexth.w.h vr23, vr2 7529 vmul.w vr8, vr22, vr21 7530 vmul.w vr3, vr23, vr21 7531 vmul.w vr2, vr22, vr20 7532 vmul.w vr27, vr23, vr20 7533 vssrarni.h.w vr3, vr8, 12 // t27a 7534 vssrarni.h.w vr27, vr2, 12 // t20a 7535 7536 vldrepl.w vr20, t0, 104 // 3513 7537 vldrepl.w vr21, t0, 108 // 2106 7538 vsllwil.w.h vr22, vr5, 0 7539 vexth.w.h vr23, vr5 7540 vneg.w vr21, vr21 7541 vmul.w vr8, vr22, vr20 7542 vmul.w vr2, vr23, vr20 7543 vmul.w vr5, vr22, vr21 7544 vmul.w vr28, vr23, vr21 7545 vssrarni.h.w vr2, vr8, 12 // t26a 7546 vssrarni.h.w vr28, vr5, 12 // t21a 7547 7548 vldrepl.w vr20, t0, 112 // 2440 -> 1220 7549 vldrepl.w vr21, t0, 116 // 3290 -> 1645 7550 vsllwil.w.h vr22, vr6, 0 7551 vexth.w.h vr23, vr6 7552 vmul.w vr8, vr22, vr21 7553 vmul.w vr5, vr23, vr21 7554 vmul.w vr6, vr22, vr20 7555 vmul.w vr25, vr23, vr20 7556 vssrarni.h.w vr5, vr8, 12 // t25a 7557 vssrarni.h.w vr25, vr6, 12 // t22a 7558 7559 vldrepl.w vr20, t0, 120 // 4052 7560 vldrepl.w vr21, t0, 124 // 601 7561 vsllwil.w.h vr22, vr1, 0 7562 vexth.w.h vr23, vr1 7563 vneg.w vr21, vr21 7564 vmul.w vr8, vr22, vr20 7565 vmul.w vr6, vr23, vr20 7566 vmul.w vr1, vr22, vr21 7567 vmul.w vr24, vr23, vr21 7568 vssrarni.h.w vr6, vr8, 12 // t24a 7569 vssrarni.h.w vr24, vr1, 12 // t23a 7570 7571 vsadd.h vr1, vr10, vr30 // t16 7572 vssub.h vr29, vr10, vr30 // t17 7573 vssub.h vr8, vr26, vr19 // t18 7574 vsadd.h vr31, vr26, vr19 // t19 7575 vsadd.h vr10, vr27, vr28 // t20 7576 vssub.h vr30, vr27, vr28 // t21 7577 vssub.h vr19, vr24, vr25 // t22 7578 vsadd.h vr26, vr24, vr25 // t23 7579 vsadd.h vr27, vr6, vr5 // t24 7580 vssub.h vr28, vr6, vr5 // t25 7581 vssub.h vr24, vr3, vr2 // t26 7582 vsadd.h vr25, vr3, vr2 // t27 7583 vsadd.h vr5, vr4, vr7 // t28 7584 vssub.h vr6, vr4, vr7 // t29 7585 vssub.h vr2, vr9, vr0 // t30 7586 vsadd.h vr3, vr9, vr0 // t31 7587 7588 vldrepl.w vr20, t0, 16 // 799 7589 vldrepl.w vr21, t0, 20 // 4017 7590 vmul_vmadd_w vr2, vr29, vr21, vr20, vr4, vr7 7591 vmul_vmsub_w vr2, vr29, vr20, vr21, vr11, vr0 7592 vssrarni.h.w vr7, vr4, 12 // t30a 7593 vssrarni.h.w vr0, vr11, 12 // t17a 7594 vmul_vmadd_w vr6, vr8, vr21, vr20, vr4, vr9 7595 vneg.w vr4, vr4 7596 vneg.w vr9, vr9 7597 vmul_vmsub_w vr6, vr8, vr20, vr21, vr11, vr2 7598 vssrarni.h.w vr9, vr4, 12 // t18a 7599 vssrarni.h.w vr2, vr11, 12 // t29a 7600 7601 vldrepl.w vr20, t0, 24 // 3406 -> 1703 7602 vldrepl.w vr21, t0, 28 // 2276 -> 1138 7603 vmul_vmadd_w vr24, vr30, vr21, vr20, vr4, vr29 7604 vmul_vmsub_w vr24, vr30, vr20, vr21, vr11, vr6 7605 vssrarni.h.w vr29, vr4, 12 // t26a 7606 vssrarni.h.w vr6, vr11, 12 // t21a 7607 7608 vmul_vmadd_w vr28, vr19, vr21, vr20, vr4, vr8 7609 vneg.w vr4, vr4 7610 vneg.w vr8, vr8 7611 vmul_vmsub_w vr28, vr19, vr20, vr21, 
vr11, vr24
    vssrarni.h.w vr8, vr4, 12 // t22a
    vssrarni.h.w vr24, vr11, 12 // t25a

    vsadd.h vr4, vr1, vr31 // t16a
    vssub.h vr30, vr1, vr31 // t19a
    vsadd.h vr19, vr0, vr9 // t17
    vssub.h vr28, vr0, vr9 // t18
    vssub.h vr1, vr26, vr10 // t20a
    vsadd.h vr31, vr26, vr10 // t23a
    vssub.h vr0, vr8, vr6 // t21
    vsadd.h vr9, vr8, vr6 // t22
    vsadd.h vr10, vr27, vr25 // t24a
    vssub.h vr26, vr27, vr25 // t27a
    vsadd.h vr6, vr24, vr29 // t25
    vssub.h vr8, vr24, vr29 // t26
    vssub.h vr25, vr3, vr5 // t28a
    vsadd.h vr27, vr3, vr5 // t31a
    vssub.h vr24, vr7, vr2 // t29
    vsadd.h vr29, vr7, vr2 // t30

    vldrepl.w vr20, t0, 8 // 1567
    vldrepl.w vr21, t0, 12 // 3784
    vmul_vmadd_w vr24, vr28, vr21, vr20, vr3, vr5
    vmul_vmsub_w vr24, vr28, vr20, vr21, vr11, vr2
    vssrarni.h.w vr5, vr3, 12 // t29a
    vssrarni.h.w vr2, vr11, 12 // t18a

    vmul_vmadd_w vr25, vr30, vr21, vr20, vr3, vr7
    vmul_vmsub_w vr25, vr30, vr20, vr21, vr11, vr24
    vssrarni.h.w vr7, vr3, 12 // t28
    vssrarni.h.w vr24, vr11, 12 // t19

    vmul_vmadd_w vr26, vr1, vr21, vr20, vr3, vr28
    vneg.w vr3, vr3
    vneg.w vr28, vr28
    vmul_vmsub_w vr26, vr1, vr20, vr21, vr11, vr25
    vssrarni.h.w vr28, vr3, 12 // t20
    vssrarni.h.w vr25, vr11, 12 // t27

    vmul_vmadd_w vr8, vr0, vr21, vr20, vr3, vr30
    vneg.w vr3, vr3
    vneg.w vr30, vr30
    vmul_vmsub_w vr8, vr0, vr20, vr21, vr11, vr1
    vssrarni.h.w vr30, vr3, 12 // t21a
    vssrarni.h.w vr1, vr11, 12 // t26a

    vsadd.h vr3, vr4, vr31 // t16
    vssub.h vr26, vr4, vr31 // t23
    vsadd.h vr0, vr19, vr9 // t17a
    vssub.h vr8, vr19, vr9 // t22a
    vsadd.h vr4, vr2, vr30 // t18
    vssub.h vr31, vr2, vr30 // t21
    vsadd.h vr9, vr24, vr28 // t19a
    vssub.h vr19, vr24, vr28 // t20a
    vssub.h vr2, vr27, vr10 // t24
    vsadd.h vr30, vr27, vr10 // t31
    vssub.h vr24, vr29, vr6 // t25a
    vsadd.h vr28, vr29, vr6 // t30a
    vssub.h vr10, vr5, vr1 // t26
    vsadd.h vr27, vr5, vr1 // t29
    vssub.h vr6, vr7, vr25 // t27a
    vsadd.h vr29, vr7, vr25 // t28a

    vldrepl.w vr20, t0, 0 // 2896
    vmul_vmsub_w vr6, vr19, vr20, vr20, vr1, vr5
    vmul_vmadd_w vr6, vr19, vr20, vr20, vr11, vr7
    vssrarni.h.w vr5, vr1, 12 // t20
    vssrarni.h.w vr7, vr11, 12 // t27

    vmul_vmsub_w vr10, vr31, vr20, vr20, vr1, vr25
    vmul_vmadd_w vr10, vr31, vr20, vr20, vr11, vr6
    vssrarni.h.w vr25, vr1, 12 // t21a
    vssrarni.h.w vr6, vr11, 12 // t26a

    vmul_vmsub_w vr24, vr8, vr20, vr20, vr1, vr19
    vmul_vmadd_w vr24, vr8, vr20, vr20, vr11, vr10
    vssrarni.h.w vr19, vr1, 12 // t22
    vssrarni.h.w vr10, vr11, 12 // t25

    vmul_vmsub_w vr2, vr26, vr20, vr20, vr1, vr31
    vmul_vmadd_w vr2, vr26, vr20, vr20, vr11, vr8
    vssrarni.h.w vr31, vr1, 12 // t23a
    vssrarni.h.w vr8, vr11, 12 // t24a

    // t31 t30a t29 t28a t27 t26a t25 t24a t23a t22 t21a t20 t19a t18 t17a t16
    // vr30 vr28 vr27 vr29 vr7 vr6 vr10 vr8 vr31 vr19 vr25 vr5 vr9 vr4 vr0 vr3

    vld_x8 t3, 0, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18

    vsadd.h vr1, vr11, vr30 // c[0]
    vssub.h vr2, vr11, vr30 // c[31]
    vsadd.h vr24, vr12, vr28 // c[1]
    vssub.h vr26, vr12, vr28 // c[30]
    vsadd.h vr11, vr13, vr27 // c[2]
    vssub.h vr30, vr13, vr27 // c[29]
    vsadd.h vr12, vr14, vr29 // c[3]
    vssub.h vr28, vr14, vr29 // c[28]
    vsadd.h vr13, vr15, vr7 // c[4]
    vssub.h vr27, vr15, \
vr7 // c[27] 7711 vsadd.h vr14, vr16, vr6 // c[5] 7712 vssub.h vr29, vr16, vr6 // c[26] 7713 vsadd.h vr7, vr17, vr10 // c[6] 7714 vssub.h vr15, vr17, vr10 // c[25] 7715 vsadd.h vr6, vr18, vr8 // c[7] 7716 vssub.h vr16, vr18, vr8 // c[24] 7717 7718 vst_x8 t3, 0, 16, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6 7719 7720 vst_x8 t3, 384, 16, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 7721 7722 vld_x8 t3, 128, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 7723 7724 vsadd.h vr1, vr11, vr31 // c[8] 7725 vssub.h vr2, vr11, vr31 // c[23] 7726 vsadd.h vr24, vr12, vr19 // c[9] 7727 vssub.h vr26, vr12, vr19 // c[22] 7728 vsadd.h vr11, vr13, vr25 // c[10] 7729 vssub.h vr30, vr13, vr25 // c[21] 7730 vsadd.h vr12, vr14, vr5 // c[11] 7731 vssub.h vr28, vr14, vr5 // c[20] 7732 vsadd.h vr13, vr15, vr9 // c[12] 7733 vssub.h vr27, vr15, vr9 // c[19] 7734 vsadd.h vr14, vr16, vr4 // c[13] 7735 vssub.h vr29, vr16, vr4 // c[18] 7736 vsadd.h vr7, vr17, vr0 // c[14] 7737 vssub.h vr15, vr17, vr0 // c[17] 7738 vsadd.h vr6, vr18, vr3 // c[15] 7739 vssub.h vr16, vr18, vr3 // c[16] 7740 7741 vst_x8 t3, 128, 16, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6 7742 7743 vst_x8 t3, 256, 16, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 7744.endm // dct_8x32_tx64_new_lsx 7745 7746function inv_txfm_add_dct_dct_64x64_8bpc_lsx 7747 bnez a3, .NO_HAS_DCONLY_64x64 7748 7749 ld.h t2, a2, 0 7750 vldi vr0, 0x8b5 7751 vreplgr2vr.w vr1, t2 7752 vldi vr20, 0x880 7753 vmul.w vr2, vr0, vr1 7754 st.h zero, a2, 0 7755 vsrari.w vr2, vr2, 8 7756 vld vr3, a0, 48 7757 vsrari.w vr2, vr2, 2 7758 vld vr1, a0, 16 7759 vmadd.w vr20, vr2, vr0 7760 vld vr2, a0, 32 7761 vssrarni.h.w vr20, vr20, 12 7762 vld vr0, a0, 0 7763 7764 vsllwil.hu.bu vr4, vr0, 0 7765 vsllwil.hu.bu vr5, vr1, 0 7766 vsllwil.hu.bu vr6, vr2, 0 7767 vsllwil.hu.bu vr7, vr3, 0 7768 vexth.hu.bu vr0, vr0 7769 vexth.hu.bu vr1, vr1 7770 vexth.hu.bu vr2, vr2 7771 vexth.hu.bu vr3, vr3 7772 vadd.h vr8, vr4, vr20 7773 vadd.h vr9, vr0, vr20 7774 vadd.h vr10, vr5, vr20 7775 vadd.h vr11, vr1, vr20 7776 vadd.h vr12, vr6, vr20 7777 vadd.h vr13, vr2, vr20 7778 vadd.h vr14, vr7, vr20 7779 vadd.h vr15, vr3, vr20 7780 vssrani.bu.h vr9, vr8, 0 7781 vssrani.bu.h vr11, vr10, 0 7782 vssrani.bu.h vr13, vr12, 0 7783 vssrani.bu.h vr15, vr14, 0 7784 vst vr9, a0, 0 7785 vst vr11, a0, 16 7786 vst vr13, a0, 32 7787 vst vr15, a0, 48 7788 7789.rept 63 7790 add.d a0, a0, a1 7791 vld vr0, a0, 0 7792 vld vr1, a0, 16 7793 vld vr2, a0, 32 7794 vld vr3, a0, 48 7795 vsllwil.hu.bu vr4, vr0, 0 7796 vsllwil.hu.bu vr5, vr1, 0 7797 vsllwil.hu.bu vr6, vr2, 0 7798 vsllwil.hu.bu vr7, vr3, 0 7799 vexth.hu.bu vr0, vr0 7800 vexth.hu.bu vr1, vr1 7801 vexth.hu.bu vr2, vr2 7802 vexth.hu.bu vr3, vr3 7803 vadd.h vr8, vr4, vr20 7804 vadd.h vr9, vr0, vr20 7805 vadd.h vr10, vr5, vr20 7806 vadd.h vr11, vr1, vr20 7807 vadd.h vr12, vr6, vr20 7808 vadd.h vr13, vr2, vr20 7809 vadd.h vr14, vr7, vr20 7810 vadd.h vr15, vr3, vr20 7811 vssrani.bu.h vr9, vr8, 0 7812 vssrani.bu.h vr11, vr10, 0 7813 vssrani.bu.h vr13, vr12, 0 7814 vssrani.bu.h vr15, vr14, 0 7815 vst vr9, a0, 0 7816 vst vr11, a0, 16 7817 vst vr13, a0, 32 7818 vst vr15, a0, 48 7819.endr 7820 b .DCT_DCT_64X64_END 7821.NO_HAS_DCONLY_64x64: 7822 7823 malloc_space 64*32*2+512+512 7824 7825 addi.d t7, sp, 64 7826 7827.macro dct64x64_core1_lsx in0, in1, in2 7828 addi.d t2, a2, \in0 7829 addi.d t7, t7, \in1 7830 li.w t4, 64*32*2+64 7831 add.d t3, sp, t4 7832 addi.d t6, t3, 512 7833 add.d t5, t6, zero 7834 7835 dct_8x32_tx64_new_lsx 0, 256, 128, 256 7836 7837 la.local t0, idct64_coeffs 7838 
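/* Four dct64_step1_lsx calls below each turn one quad of odd inputs
 * into eight t32a..t63a terms, stepping t0 through idct64_coeffs.
 * The core primitive, vmul_vssrarni_hw, behaves per 16-bit lane
 * roughly like this sketch (mul_sh12 is a hypothetical name; the
 * saturation comes from vssrarni.h.w):
 *
 *     static inline int16_t mul_sh12(int16_t in, int32_t c)
 *     {
 *         int32_t v = ((int32_t)in * c + 2048) >> 12; // round to 12 bits
 *         if (v < -32768) v = -32768;                 // saturate to int16
 *         if (v >  32767) v =  32767;
 *         return (int16_t)v;
 *     }
 *
 * so e.g. for the first quad: t32a = mul_sh12(in1, 101) and
 * t63a = mul_sh12(in1, 4095), per the table above.
 */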
7839 addi.d t2, a2, \in2 // 32 ... 7840 // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a 7841 vld vr0, t2, 128*0 // in1 7842 vld vr1, t2, 128*15 // in31 7843 vld vr2, t2, 128*8 // in17 7844 vld vr3, t2, 128*7 // in15 7845 dct64_step1_lsx 7846 7847 addi.d t0, t0, 48 7848 addi.d t6, t6, 128 7849 // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a 7850 vld vr0, t2, 128*3 // in7 7851 vld vr1, t2, 128*12 // in25 7852 vld vr2, t2, 128*11 // in23 7853 vld vr3, t2, 128*4 // in9 7854 dct64_step1_lsx 7855 7856 addi.d t0, t0, 48 7857 addi.d t6, t6, 128 7858 // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a 7859 vld vr0, t2, 128*2 // in5 7860 vld vr1, t2, 128*13 // in27 7861 vld vr2, t2, 128*10 // in21 7862 vld vr3, t2, 128*5 // in11 7863 dct64_step1_lsx 7864 7865 addi.d t0, t0, 48 7866 addi.d t6, t6, 128 7867 // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a 7868 vld vr0, t2, 128*1 // in3 7869 vld vr1, t2, 128*14 // in29 7870 vld vr2, t2, 128*9 // in19 7871 vld vr3, t2, 128*6 // in13 7872 dct64_step1_lsx 7873 7874 la.local t0, idct_coeffs 7875 addi.d t4, t5, 16*7 7876 // t32a/t39/t40a/t47/t48/t55a/t56/t63a 7877 dct64_step2_lsx 7878 7879 addi.d t5, t5, 16 7880 addi.d t4, t4, -16 7881 // t33/t38a/t41/t46a/t49a/t54/t57a/t62 7882 dct64_step2_lsx 7883 7884 addi.d t5, t5, 16 7885 addi.d t4, t4, -16 7886 // t34a/t37/t42a/t45/t50/t53a/t58/t61a 7887 dct64_step2_lsx 7888 7889 addi.d t5, t5, 16 7890 addi.d t4, t4, -16 7891 // t35/t36a/t43/t44a/t51a/t52/t59a/t60 7892 dct64_step2_lsx 7893 7894 li.w t4, 64*32*2+64+512 7895 add.d t5, t4, sp 7896 addi.d t4, t5, 16*7 7897 dct64_step4_lsx transpose8x8, 2, 0, 128, 112, 128 7898 7899 addi.d t3, t3, 128 7900 addi.d t4, t4, -16*8 7901 addi.d t5, t5, -16*8 7902 dct64_step4_lsx transpose8x8, 2, 16, 128, 96, 128 7903 7904 addi.d t5, t5, -16*8 7905 addi.d t4, t4, -16*8 7906 addi.d t3, t3, 128 7907 dct64_step4_lsx transpose8x8, 2, 32, 128, 80, 128 7908 7909 addi.d t5, t5, -16*8 7910 addi.d t4, t4, -16*8 7911 addi.d t3, t3, 128 7912 dct64_step4_lsx transpose8x8, 2, 48, 128, 64, 128 7913.endm 7914 7915 dct64x64_core1_lsx 0, 0, 64 7916 7917 dct64x64_core1_lsx 16, 128*8, 64+16 7918 7919 dct64x64_core1_lsx 32, 128*8, 64+16*2 7920 7921 dct64x64_core1_lsx 48, 128*8, 64+16*3 7922 7923 vreplgr2vr.h vr31, zero 7924.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512, 528, 544, 560, 576, 592, 608, 624, 640, 656, 672, 688, 704, 720, 736, 752, 768, 784, 800, 816, 832, 848, 864, 880, 896, 912, 928, 944, 960, 976, 992, 1008, 1024, 1040, 1056, 1072, 1088, 1104, 1120, 1136, 1152, 1168, 1184, 1200, 1216, 1232, 1248, 1264, 1280, 1296, 1312, 1328, 1344, 1360, 1376, 1392, 1408, 1424, 1440, 1456, 1472, 1488, 1504, 1520, 1536, 1552, 1568, 1584, 1600, 1616, 1632, 1648, 1664, 1680, 1696, 1712, 1728, 1744, 1760, 1776, 1792, 1808, 1824, 1840, 1856, 1872, 1888, 1904, 1920, 1936, 1952, 1968, 1984, 2000, 2016, 2032 7925 vst vr31, a2, \i 7926.endr 7927 7928.macro dct64x64_core2_lsx in0, in1 7929 addi.d t2, sp, 64+\in0 7930 addi.d t7, sp, 64+\in0 7931 li.w t4, 64*32*2+64 7932 add.d t3, sp, t4 7933 addi.d t6, t3, 512 7934 add.d t5, t6, zero 7935 7936 addi.d t2, t2, 1024 7937 addi.d t2, t2, 1024 7938 dct_8x32_tx64_new_lsx -2048, 512, 256-2048, 512 7939 7940 la.local t0, idct64_coeffs 7941 7942 addi.d t2, sp, 64+64*2+\in0 7943 addi.d t4, t2, 256*7 7944 addi.d t4, t4, 256 7945 7946 vld vr0, t2, 256*0 // in1 7947 vld vr1, t4, 256*7 // in31 7948 vld vr2, t4, 256*0 // in17 7949 vld vr3, t2, 256*7 // in15 7950 
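    // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a (same routing as core1 above)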
dct64_step1_lsx 7951 7952 addi.d t0, t0, 48 7953 addi.d t6, t6, 128 7954 vld vr0, t2, 256*3 // in7 7955 vld vr1, t4, 256*4 // in25 7956 vld vr2, t4, 256*3 // in23 7957 vld vr3, t2, 256*4 // in9 7958 dct64_step1_lsx 7959 7960 addi.d t0, t0, 48 7961 addi.d t6, t6, 128 7962 vld vr0, t2, 256*2 // in5 7963 vld vr1, t4, 256*5 // in27 7964 vld vr2, t4, 256*2 // in21 7965 vld vr3, t2, 256*5 // in11 7966 dct64_step1_lsx 7967 7968 addi.d t0, t0, 48 7969 addi.d t6, t6, 128 7970 vld vr0, t2, 256*1 // in3 7971 vld vr1, t4, 256*6 // in29 7972 vld vr2, t4, 256*1 // in19 7973 vld vr3, t2, 256*6 // in13 7974 dct64_step1_lsx 7975 7976 la.local t0, idct_coeffs 7977 addi.d t4, t5, 16*7 7978 // t32a/t39/t40a/t47/t48/t55a/t56/t63a 7979 dct64_step2_lsx 7980 7981 addi.d t5, t5, 16 7982 addi.d t4, t4, -16 7983 // t33/t38a/t41/t46a/t49a/t54/t57a/t62 7984 dct64_step2_lsx 7985 7986 addi.d t5, t5, 16 7987 addi.d t4, t4, -16 7988 // t34a/t37/t42a/t45/t50/t53a/t58/t61a 7989 dct64_step2_lsx 7990 7991 addi.d t5, t5, 16 7992 addi.d t4, t4, -16 7993 // t35/t36a/t43/t44a/t51a/t52/t59a/t60 7994 dct64_step2_lsx 7995 7996 li.w t4, 64*32*2+64+512 7997 add.d t5, t4, sp 7998 addi.d t4, t5, 16*7 7999 addi.d a0, a0, \in1 8000 // 0 - 7, 56 -63 8001 dct64_step3_lsx 8002 8003 li.w t8, 0 8004 mul.w t0, t8, a1 8005 add.d t0, a0, t0 8006 alsl.d t6, a1, t0, 1 8007 addi.d t1, t0, 0 8008 add.d t2, t0, a1 8009 dct64_step5_lsx vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1 8010 8011 li.w t8, 56 8012 mul.w t0, t8, a1 8013 add.d t0, a0, t0 8014 alsl.d t6, a1, t0, 1 8015 addi.d t1, t0, 0 8016 add.d t2, t0, a1 8017 dct64_step5_lsx vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21 8018 8019 // 8 - 15, 48 - 55 8020 addi.d t3, t3, 128 8021 addi.d t4, t4, -16*8 8022 addi.d t5, t5, -16*8 8023 dct64_step3_lsx 8024 8025 li.w t8, 8 8026 mul.w t0, t8, a1 8027 add.d t0, t0, a0 8028 alsl.d t6, a1, t0, 1 8029 addi.d t1, t0, 0 8030 add.d t2, t0, a1 8031 dct64_step5_lsx vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1 8032 8033 li.w t8, 48 8034 mul.w t0, t8, a1 8035 add.d t0, t0, a0 8036 alsl.d t6, a1, t0, 1 8037 addi.d t1, t0, 0 8038 add.d t2, t0, a1 8039 dct64_step5_lsx vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21 8040 8041 // 16 - 23, 40 - 47 8042 addi.d t3, t3, 128 8043 addi.d t4, t4, -16*8 8044 addi.d t5, t5, -16*8 8045 dct64_step3_lsx 8046 8047 li.w t8, 16 8048 mul.w t0, t8, a1 8049 add.d t0, t0, a0 8050 alsl.d t6, a1, t0, 1 8051 addi.d t1, t0, 0 8052 add.d t2, t0, a1 8053 dct64_step5_lsx vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1 8054 8055 li.w t8, 40 8056 mul.w t0, t8, a1 8057 add.d t0, t0, a0 8058 alsl.d t6, a1, t0, 1 8059 addi.d t1, t0, 0 8060 add.d t2, t0, a1 8061 dct64_step5_lsx vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21 8062 8063 // 24 - 31, 32 - 39 8064 addi.d t3, t3, 128 8065 addi.d t4, t4, -16*8 8066 addi.d t5, t5, -16*8 8067 dct64_step3_lsx 8068 8069 li.w t8, 24 8070 mul.w t0, t8, a1 8071 add.d t0, t0, a0 8072 alsl.d t6, a1, t0, 1 8073 addi.d t1, t0, 0 8074 add.d t2, t0, a1 8075 dct64_step5_lsx vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1 8076 8077 li.w t8, 32 8078 mul.w t0, t8, a1 8079 add.d t0, t0, a0 8080 alsl.d t6, a1, t0, 1 8081 addi.d t1, t0, 0 8082 add.d t2, t0, a1 8083 dct64_step5_lsx vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21 8084.endm 8085 8086 dct64x64_core2_lsx 16*0, 0 8087 8088 dct64x64_core2_lsx 16*1, 8 8089 8090 dct64x64_core2_lsx 16*2, 8 8091 8092 dct64x64_core2_lsx 16*3, 8 8093 8094 dct64x64_core2_lsx 16*4, 8 8095 8096 dct64x64_core2_lsx 16*5, 8 8097 8098 dct64x64_core2_lsx 16*6, 8 8099 8100 dct64x64_core2_lsx 16*7, 8 8101 8102 
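/* Each dct64x64_core2_lsx invocation above finishes through
 * dct64_step5_lsx: the column output is rounded by 4 and accumulated
 * into the destination rows. Per-pixel scalar model (a sketch;
 * clip_u8 stands for the unsigned saturation done by vssrani.bu.h):
 *
 *     dst[x] = clip_u8(dst[x] + ((c[x] + 8) >> 4));
 */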
    free_space 64*32*2+512+512
.DCT_DCT_64X64_END:
endfunc