/*
 * Copyright © 2023, VideoLAN and dav1d authors
 * Copyright © 2023, Loongson Technology Corporation Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/loongarch/loongson_asm.S"

#define REST_UNIT_STRIDE (400)

.macro MADD_HU_BU in0, in1, out0, out1
    vsllwil.hu.bu vr12, \in0, 0
    vexth.hu.bu vr13, \in0
    vmadd.h \out0, vr12, \in1
    vmadd.h \out1, vr13, \in1
.endm

const wiener_shuf
.byte 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18
endconst

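// A rough scalar model of the horizontal Wiener pass below (simplified, not
// the dav1d C reference; the width is actually processed 16 pixels at a time
// and the clamp bounds come from vr0/vr1):
//
//   for (int j = 0; j < h; j++) {
//       for (int i = 0; i < w; i++) {
//           int32_t sum = (1 << 14) + (tmp_ptr[i + 3] << 7);
//           for (int k = 0; k < 7; k++)
//               sum += tmp_ptr[i + k] * filterh[k];
//           const int32_t v = (sum + 4) >> 3;                // vsrari.w .., 3
//           hor_ptr[i] = v < 0 ? 0 : v > (1 << 13) - 1 ? (1 << 13) - 1 : v;
//       }
//       tmp_ptr += REST_UNIT_STRIDE;
//       hor_ptr += REST_UNIT_STRIDE;
//   }
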
/*
void wiener_filter_h_lsx(int32_t *hor_ptr,
                         uint8_t *tmp_ptr,
                         const int16_t filterh[8],
                         const int w, const int h)
*/
function wiener_filter_h_8bpc_lsx
    addi.d sp, sp, -40
    fst.d f24, sp, 0
    fst.d f25, sp, 8
    fst.d f26, sp, 16
    fst.d f27, sp, 24
    fst.d f28, sp, 32
    li.w t7, 1<<14 // clip_limit

    la.local t1, wiener_shuf
    vld vr4, t1, 0
    vld vr14, a2, 0 // filter[0][k]
    vreplvei.h vr21, vr14, 0
    vreplvei.h vr22, vr14, 1
    vreplvei.h vr23, vr14, 2
    vreplvei.h vr24, vr14, 3
    vreplvei.h vr25, vr14, 4
    vreplvei.h vr26, vr14, 5
    vreplvei.h vr27, vr14, 6
    vreplgr2vr.w vr0, t7

.WIENER_FILTER_H_H:
    addi.w a4, a4, -1 // h
    addi.w t0, a3, 0 // w
    addi.d t1, a1, 0 // tmp_ptr
    addi.d t2, a0, 0 // hor_ptr

.WIENER_FILTER_H_W:
    addi.w t0, t0, -16
    vld vr5, t1, 0
    vld vr13, t1, 16

    vsubi.bu vr14, vr4, 2
    vsubi.bu vr15, vr4, 1
    vshuf.b vr6, vr13, vr5, vr14 // 1 ... 8, 9 ... 16
    vshuf.b vr7, vr13, vr5, vr15 // 2 ... 9, 10 ... 17
    vshuf.b vr8, vr13, vr5, vr4 // 3 ... 10, 11 ... 18
    vaddi.bu vr14, vr4, 1
    vaddi.bu vr15, vr4, 2
    vshuf.b vr9, vr13, vr5, vr14 // 4 ... 11, 12 ... 19
    vshuf.b vr10, vr13, vr5, vr15 // 5 ... 12, 13 ... 20
    vaddi.bu vr14, vr4, 3
    vshuf.b vr11, vr13, vr5, vr14 // 6 ... 13, 14 ... 21

    vsllwil.hu.bu vr15, vr8, 0 // 3 4 5 6 7 8 9 10
    vexth.hu.bu vr16, vr8 // 11 12 13 14 15 16 17 18
    vsllwil.wu.hu vr17, vr15, 0 // 3 4 5 6
    vexth.wu.hu vr18, vr15 // 7 8 9 10
    vsllwil.wu.hu vr19, vr16, 0 // 11 12 13 14
    vexth.wu.hu vr20, vr16 // 15 16 17 18
    vslli.w vr17, vr17, 7
    vslli.w vr18, vr18, 7
    vslli.w vr19, vr19, 7
    vslli.w vr20, vr20, 7
    vxor.v vr15, vr15, vr15
    vxor.v vr14, vr14, vr14

    MADD_HU_BU vr5, vr21, vr14, vr15
    MADD_HU_BU vr6, vr22, vr14, vr15
    MADD_HU_BU vr7, vr23, vr14, vr15
    MADD_HU_BU vr8, vr24, vr14, vr15
    MADD_HU_BU vr9, vr25, vr14, vr15
    MADD_HU_BU vr10, vr26, vr14, vr15
    MADD_HU_BU vr11, vr27, vr14, vr15

    vsllwil.w.h vr5, vr14, 0 // 0 1 2 3
    vexth.w.h vr6, vr14 // 4 5 6 7
    vsllwil.w.h vr7, vr15, 0 // 8 9 10 11
    vexth.w.h vr8, vr15 // 12 13 14 15
    vadd.w vr17, vr17, vr5
    vadd.w vr18, vr18, vr6
    vadd.w vr19, vr19, vr7
    vadd.w vr20, vr20, vr8
    vadd.w vr17, vr17, vr0
    vadd.w vr18, vr18, vr0
    vadd.w vr19, vr19, vr0
    vadd.w vr20, vr20, vr0

    vsrli.w vr1, vr0, 1
    vsubi.wu vr1, vr1, 1
    vxor.v vr3, vr3, vr3
    vsrari.w vr17, vr17, 3
    vsrari.w vr18, vr18, 3
    vsrari.w vr19, vr19, 3
    vsrari.w vr20, vr20, 3
    vclip.w vr17, vr17, vr3, vr1
    vclip.w vr18, vr18, vr3, vr1
    vclip.w vr19, vr19, vr3, vr1
    vclip.w vr20, vr20, vr3, vr1

    vst vr17, t2, 0
    vst vr18, t2, 16
    vst vr19, t2, 32
    vst vr20, t2, 48
    addi.d t1, t1, 16
    addi.d t2, t2, 64
    blt zero, t0, .WIENER_FILTER_H_W

    addi.d a1, a1, REST_UNIT_STRIDE
    addi.d a0, a0, (REST_UNIT_STRIDE << 2)
    bnez a4, .WIENER_FILTER_H_H

    fld.d f24, sp, 0
    fld.d f25, sp, 8
    fld.d f26, sp, 16
    fld.d f27, sp, 24
    fld.d f28, sp, 32
    addi.d sp, sp, 40
endfunc

.macro APPLY_FILTER in0, in1, in2
    alsl.d t7, \in0, \in1, 2
    vld vr10, t7, 0
    vld vr11, t7, 16
    vld vr12, t7, 32
    vld vr13, t7, 48
    vmadd.w vr14, vr10, \in2
    vmadd.w vr15, vr11, \in2
    vmadd.w vr16, vr12, \in2
    vmadd.w vr17, vr13, \in2
.endm

.macro wiener_filter_v_8bpc_core_lsx
    vreplgr2vr.w vr14, t6
    vreplgr2vr.w vr15, t6
    vreplgr2vr.w vr16, t6
    vreplgr2vr.w vr17, t6

    addi.w t7, t2, 0 // j + index k
    mul.w t7, t7, t8 // (j + index) * REST_UNIT_STRIDE
    add.w t7, t7, t4 // (j + index) * REST_UNIT_STRIDE + i

    APPLY_FILTER t7, a2, vr2
    APPLY_FILTER t8, t7, vr3
    APPLY_FILTER t8, t7, vr4
    APPLY_FILTER t8, t7, vr5
    APPLY_FILTER t8, t7, vr6
    APPLY_FILTER t8, t7, vr7
    APPLY_FILTER t8, t7, vr8
    vssrarni.hu.w vr15, vr14, 11
    vssrarni.hu.w vr17, vr16, 11
    vssrlni.bu.h vr17, vr15, 0
.endm

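// A rough scalar model of the vertical Wiener pass below (simplified, not
// the dav1d C reference): a 7-tap vertical filter over the intermediate
// hor[] buffer, with the 8bpc rounding offsets baked in.
//
//   for (int j = 0; j < h; j++)
//       for (int i = 0; i < w; i++) {
//           int32_t sum = -(1 << 18);
//           for (int k = 0; k < 7; k++)
//               sum += hor[(j + k) * REST_UNIT_STRIDE + i] * filterv[k];
//           const int32_t v = (sum + (1 << 10)) >> 11;       // vssrarni.hu.w .., 11
//           p[j * p_stride + i] = v < 0 ? 0 : v > 255 ? 255 : v;
//       }
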
/*
void wiener_filter_v_lsx(uint8_t *p,
                         const ptrdiff_t p_stride,
                         const int32_t *hor,
                         const int16_t filterv[8],
                         const int w, const int h)
*/
function wiener_filter_v_8bpc_lsx
    li.w t6, -(1 << 18)

    li.w t8, REST_UNIT_STRIDE
    ld.h t0, a3, 0
    ld.h t1, a3, 2
    vreplgr2vr.w vr2, t0
    vreplgr2vr.w vr3, t1
    ld.h t0, a3, 4
    ld.h t1, a3, 6
    vreplgr2vr.w vr4, t0
    vreplgr2vr.w vr5, t1
    ld.h t0, a3, 8
    ld.h t1, a3, 10
    vreplgr2vr.w vr6, t0
    vreplgr2vr.w vr7, t1
    ld.h t0, a3, 12
    vreplgr2vr.w vr8, t0

    andi t1, a4, 0xf
    sub.w t0, a4, t1 // w-w%16
    or t2, zero, zero // j
    or t4, zero, zero
    beqz t0, .WIENER_FILTER_V_W_LT16

.WIENER_FILTER_V_H:
    andi t1, a4, 0xf
    add.d t3, zero, a0 // p
    or t4, zero, zero // i

.WIENER_FILTER_V_W:

    wiener_filter_v_8bpc_core_lsx

    mul.w t5, t2, a1 // j * stride
    add.w t5, t5, t4 // j * stride + i
    add.d t3, a0, t5
    addi.w t4, t4, 16
    vst vr17, t3, 0
    bne t0, t4, .WIENER_FILTER_V_W

    beqz t1, .WIENER_FILTER_V_W_EQ16

    wiener_filter_v_8bpc_core_lsx

    addi.d t3, t3, 16
    andi t1, a4, 0xf

.WIENER_FILTER_V_ST_REM:
    vstelm.b vr17, t3, 0, 0
    vbsrl.v vr17, vr17, 1
    addi.d t3, t3, 1
    addi.w t1, t1, -1
    bnez t1, .WIENER_FILTER_V_ST_REM
.WIENER_FILTER_V_W_EQ16:
    addi.w t2, t2, 1
    blt t2, a5, .WIENER_FILTER_V_H
    b .WIENER_FILTER_V_END

.WIENER_FILTER_V_W_LT16:
    andi t1, a4, 0xf
    add.d t3, zero, a0

    wiener_filter_v_8bpc_core_lsx

    mul.w t5, t2, a1 // j * stride
    add.d t3, a0, t5

.WIENER_FILTER_V_ST_REM_1:
    vstelm.b vr17, t3, 0, 0
    vbsrl.v vr17, vr17, 1
    addi.d t3, t3, 1
    addi.w t1, t1, -1
    bnez t1, .WIENER_FILTER_V_ST_REM_1

    addi.w t2, t2, 1
    blt t2, a5, .WIENER_FILTER_V_W_LT16

.WIENER_FILTER_V_END:
endfunc

/*
void boxsum3_h(int32_t *sumsq, coef *sum, const pixel *src,
               const int w, const int h)
*/
function boxsum3_h_8bpc_lsx
    addi.d a2, a2, REST_UNIT_STRIDE
    li.w t0, 1
    addi.w a3, a3, -2
    addi.w a4, a4, -4

.LBS3_H_H:
    alsl.d t1, t0, a1, 1 // sum_v *sum_v = sum + x
    alsl.d t2, t0, a0, 2 // sumsq_v *sumsq_v = sumsq + x
    add.d t3, t0, a2 // s
    addi.w t5, a3, 0
.LBS3_H_W:
    vld vr0, t3, 0
    vld vr1, t3, REST_UNIT_STRIDE
    vld vr2, t3, (REST_UNIT_STRIDE<<1)

    vilvl.b vr3, vr1, vr0
    vhaddw.hu.bu vr4, vr3, vr3
    vilvh.b vr5, vr1, vr0
    vhaddw.hu.bu vr6, vr5, vr5
    vsllwil.hu.bu vr7, vr2, 0
    vexth.hu.bu vr8, vr2
    // sum_v
    vadd.h vr4, vr4, vr7
    vadd.h vr6, vr6, vr8
    vst vr4, t1, REST_UNIT_STRIDE<<1
    vst vr6, t1, (REST_UNIT_STRIDE<<1)+16
    addi.d t1, t1, 32
    // sumsq
    vmulwev.h.bu vr9, vr3, vr3
    vmulwod.h.bu vr10, vr3, vr3
    vmulwev.h.bu vr11, vr5, vr5
    vmulwod.h.bu vr12, vr5, vr5
    vmul.h vr7, vr7, vr7
    vmul.h vr8, vr8, vr8
    vaddwev.w.hu vr13, vr10, vr9
    vaddwod.w.hu vr14, vr10, vr9
    vilvl.w vr3, vr14, vr13
    vilvh.w vr4, vr14, vr13
    vaddwev.w.hu vr13, vr12, vr11
    vaddwod.w.hu vr14, vr12, vr11
    vilvl.w vr15, vr14, vr13
    vilvh.w vr16, vr14, vr13
    vsllwil.wu.hu vr9, vr7, 0
    vexth.wu.hu vr10, vr7
    vsllwil.wu.hu vr11, vr8, 0
    vexth.wu.hu vr12, vr8
    vadd.w vr9, vr9, vr3
    vadd.w vr10, vr10, vr4
    vadd.w vr11, vr11, vr15
    vadd.w vr12, vr12, vr16
    vst vr9, t2, REST_UNIT_STRIDE<<2
    vst vr10, t2, (REST_UNIT_STRIDE<<2)+16
    vst vr11, t2, (REST_UNIT_STRIDE<<2)+32
    vst vr12, t2, (REST_UNIT_STRIDE<<2)+48
    addi.d t2, t2, 64

    addi.w t5, t5, -16
    addi.d t3, t3, 16
    blt zero, t5, .LBS3_H_W

    addi.d a0, a0, REST_UNIT_STRIDE<<2
    addi.d a1, a1, REST_UNIT_STRIDE<<1
    addi.d a2, a2, REST_UNIT_STRIDE
    addi.d a4, a4, -1
    blt zero, a4, .LBS3_H_H

.LBS3_H_END:
endfunc

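// boxsum3_h_8bpc_lsx above and boxsum3_v_8bpc_lsx below together build 3x3
// box sums of pixels and of squared pixels over the padded source. A rough
// scalar model (simplified edge/width handling; the vector code works on
// whole 16- or 8-element blocks):
//
//   // vertical 3-tap pass (boxsum3_h): output row y covers source rows y..y+2
//   for (int y = 1; y <= h - 4; y++)
//       for (int x = 1; x < w - 1; x++) {
//           const int a = src[(y + 0) * REST_UNIT_STRIDE + x];
//           const int b = src[(y + 1) * REST_UNIT_STRIDE + x];
//           const int c = src[(y + 2) * REST_UNIT_STRIDE + x];
//           sum  [y * REST_UNIT_STRIDE + x] = a + b + c;
//           sumsq[y * REST_UNIT_STRIDE + x] = a * a + b * b + c * c;
//       }
//
//   // horizontal 3-tap pass (boxsum3_v), done in place on the same rows;
//   // the reads refer to the vertical-pass values, which the vector code
//   // preserves by carrying the lanes it is about to overwrite in registers
//   for (int y = 1; y <= h - 4; y++)
//       for (int x = 2; x < w - 2; x++) {
//           sum  [y * REST_UNIT_STRIDE + x] = v_sum  (x - 1) + v_sum  (x) + v_sum  (x + 1);
//           sumsq[y * REST_UNIT_STRIDE + x] = v_sumsq(x - 1) + v_sumsq(x) + v_sumsq(x + 1);
//       }
//   // where v_sum(x)/v_sumsq(x) denote the values written by the vertical pass
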
/*
void boxsum3_v(int32_t *sumsq, coef *sum,
               const int w, const int h)
*/
function boxsum3_v_8bpc_lsx
    addi.d a0, a0, (REST_UNIT_STRIDE<<2)
    addi.d a1, a1, (REST_UNIT_STRIDE<<1)
    addi.w a3, a3, -4
    addi.w a2, a2, -4

.LBS3_V_H:
    sub.w t3, a2, zero
    addi.d t0, a0, 4
    addi.d t1, a1, 2
    addi.d t5, a0, 8
    addi.d t6, a1, 4

    vld vr0, t1, 0 // a 0 1 2 3 4 5 6 7
    vld vr1, t1, 2 // b 1 2 3 4 5 6 7 8
    vld vr2, t1, 4 // c 2 3 4 5 6 7 8 9
    vld vr3, t0, 0 // a2 0 1 2 3
    vld vr4, t0, 4 // b2 1 2 3 4
    vld vr5, t0, 8 // c2 2 3 4 5
    vld vr6, t0, 16 // 3 4 5 6
    vld vr7, t0, 20 // 4 5 6 7
    vld vr8, t0, 24 // 5 6 7 8
    vadd.h vr9, vr0, vr1
    vadd.h vr9, vr9, vr2
    vadd.w vr10, vr3, vr4
    vadd.w vr10, vr10, vr5
    vadd.w vr11, vr6, vr7
    vadd.w vr11, vr11, vr8
    vpickve2gr.h t7, vr2, 6
    vpickve2gr.w t8, vr8, 2
    vst vr9, t6, 0
    vst vr10, t5, 0
    vst vr11, t5, 16

    addi.d t1, t1, 16
    addi.d t0, t0, 32
    addi.d t5, t5, 32
    addi.d t6, t6, 16
    addi.d t3, t3, -8
    ble t3, zero, .LBS3_V_H0

.LBS3_V_W8:
    vld vr0, t1, 0 // a 0 1 2 3 4 5 6 7
    vld vr1, t1, 2 // b 1 2 3 4 5 6 7 8
    vld vr2, t1, 4 // c 2 3 4 5 6 7 8 9
    vld vr3, t0, 0 // a2 0 1 2 3
    vld vr4, t0, 4 // b2 1 2 3 4
    vld vr5, t0, 8 // c2 2 3 4 5
    vld vr6, t0, 16 // 3 4 5 6
    vld vr7, t0, 20 // 4 5 6 7
    vld vr8, t0, 24 // 5 6 7 8
    vinsgr2vr.h vr0, t7, 0
    vinsgr2vr.w vr3, t8, 0
    vpickve2gr.h t7, vr2, 6
    vpickve2gr.w t8, vr8, 2
    vadd.h vr9, vr0, vr1
    vadd.w vr10, vr3, vr4
    vadd.w vr11, vr6, vr7
    vadd.h vr9, vr9, vr2
    vadd.w vr10, vr10, vr5
    vadd.w vr11, vr11, vr8
    vst vr9, t6, 0
    vst vr10, t5, 0
    vst vr11, t5, 16
    addi.d t3, t3, -8
    addi.d t1, t1, 16
    addi.d t0, t0, 32
    addi.d t5, t5, 32
    addi.d t6, t6, 16
    blt zero, t3, .LBS3_V_W8

.LBS3_V_H0:
    addi.d a1, a1, REST_UNIT_STRIDE<<1
    addi.d a0, a0, REST_UNIT_STRIDE<<2
    addi.w a3, a3, -1
    bnez a3, .LBS3_V_H

.LBS3_V_END:
endfunc

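// boxsum3_sgf_h_8bpc_lsx below converts the box sums into the self-guided
// "surface" pair (A, B). A rough per-element model (simplified, not the
// dav1d C reference); here n = 9 and sgr_one_by_x = 455, while the 5x5
// variant further down uses n = 25 and 164:
//
//   const int32_t a  = AA[i];                      // 3x3 sum of squares
//   const int32_t b  = BB[i];                      // 3x3 sum of pixels
//   const int32_t d  = a * n - b * b;
//   const uint32_t p = d < 0 ? 0 : d;
//   const uint32_t z = (p * s + (1 << 19)) >> 20;  // vsrlri.w .., 20
//   const uint32_t x = dav1d_sgr_x_by_x[z > 255 ? 255 : z];
//   AA[i] = (x * b * sgr_one_by_x + (1 << 11)) >> 12;
//   BB[i] = 256 - x;
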
/*
boxsum3_selfguided_filter(int32_t *sumsq, coef *sum,
                          const int w, const int h,
                          const unsigned s)
*/
function boxsum3_sgf_h_8bpc_lsx
    addi.d a0, a0, REST_UNIT_STRIDE<<2
    addi.d a0, a0, 12 // AA
    addi.d a1, a1, REST_UNIT_STRIDE<<1
    addi.d a1, a1, 6 // BB
    la.local t8, dav1d_sgr_x_by_x
    li.w t6, 455
    vreplgr2vr.w vr20, t6
    li.w t6, 255
    vreplgr2vr.w vr22, t6
    vaddi.wu vr21, vr22, 1 // 256
    vreplgr2vr.w vr6, a4
    vldi vr19, 0x809
    addi.w a2, a2, 2 // w + 2
    addi.w a3, a3, 2 // h + 2

.LBS3SGF_H_H:
    addi.w t2, a2, 0
    addi.d t0, a0, -4
    addi.d t1, a1, -2

.LBS3SGF_H_W:
    addi.w t2, t2, -8
    vld vr0, t0, 0 // AA[i]
    vld vr1, t0, 16
    vld vr2, t1, 0 // BB[i]

    vmul.w vr4, vr0, vr19 // a * n
    vmul.w vr5, vr1, vr19 // a * n
    vsllwil.w.h vr9, vr2, 0
    vexth.w.h vr10, vr2
    vmsub.w vr4, vr9, vr9 // p
    vmsub.w vr5, vr10, vr10 // p
    vmaxi.w vr4, vr4, 0
    vmaxi.w vr5, vr5, 0 // p
    vmul.w vr4, vr4, vr6 // p * s
    vmul.w vr5, vr5, vr6 // p * s
    vsrlri.w vr4, vr4, 20
    vsrlri.w vr5, vr5, 20 // z
    vmin.w vr4, vr4, vr22
    vmin.w vr5, vr5, vr22

    vpickve2gr.w t6, vr4, 0
    ldx.bu t7, t8, t6
    vinsgr2vr.w vr7, t7, 0
    vpickve2gr.w t6, vr4, 1
    ldx.bu t7, t8, t6
    vinsgr2vr.w vr7, t7, 1
    vpickve2gr.w t6, vr4, 2
    ldx.bu t7, t8, t6
    vinsgr2vr.w vr7, t7, 2
    vpickve2gr.w t6, vr4, 3
    ldx.bu t7, t8, t6
    vinsgr2vr.w vr7, t7, 3

    vpickve2gr.w t6, vr5, 0
    ldx.bu t7, t8, t6
    vinsgr2vr.w vr8, t7, 0
    vpickve2gr.w t6, vr5, 1
    ldx.bu t7, t8, t6
    vinsgr2vr.w vr8, t7, 1
    vpickve2gr.w t6, vr5, 2
    ldx.bu t7, t8, t6
    vinsgr2vr.w vr8, t7, 2
    vpickve2gr.w t6, vr5, 3
    ldx.bu t7, t8, t6
    vinsgr2vr.w vr8, t7, 3 // x

    vmul.w vr9, vr7, vr9 // x * BB[i]
    vmul.w vr10, vr8, vr10
    vmul.w vr9, vr9, vr20 // x * BB[i] * sgr_one_by_x
    vmul.w vr10, vr10, vr20
    vsrlri.w vr9, vr9, 12
    vsrlri.w vr10, vr10, 12
    vsub.w vr7, vr21, vr7
    vsub.w vr8, vr21, vr8
    vpickev.h vr8, vr8, vr7

    vst vr9, t0, 0
    vst vr10, t0, 16
    vst vr8, t1, 0
    addi.d t0, t0, 32
    addi.d t1, t1, 16
    blt zero, t2, .LBS3SGF_H_W

    addi.d a0, a0, REST_UNIT_STRIDE<<2
    addi.d a1, a1, REST_UNIT_STRIDE<<1
    addi.w a3, a3, -1
    bnez a3, .LBS3SGF_H_H
endfunc

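// boxsum3_sgf_v_8bpc_lsx below combines the surface pair with the source:
// both planes get a 3x3 weighted sum (weight 4 on the centre and the four
// edge neighbours, 3 on the corners), then one multiply-add per pixel.
// Rough scalar model (simplified; A is the int32 sumsq plane, B the coef
// sum plane, S = REST_UNIT_STRIDE, dst is int16 with extra precision):
//
//   for (int j = 0; j < h; j++) {
//       for (int i = 0; i < w; i++) {
//           const int32_t b = (A[i] + A[i - 1] + A[i + 1] + A[i - S] + A[i + S]) * 4 +
//                             (A[i - 1 - S] + A[i + 1 - S] + A[i - 1 + S] + A[i + 1 + S]) * 3;
//           const int32_t a = (B[i] + B[i - 1] + B[i + 1] + B[i - S] + B[i + S]) * 4 +
//                             (B[i - 1 - S] + B[i + 1 - S] + B[i - 1 + S] + B[i + 1 + S]) * 3;
//           dst[i] = (b + a * src[i] + (1 << 8)) >> 9;       // vssrlrni.h.w .., 9
//       }
//       dst += 384; src += S; A += S; B += S;
//   }
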
/*
boxsum3_selfguided_filter(coef *dst, pixel *src,
                          int32_t *sumsq, coef *sum,
                          const int w, const int h)
*/
function boxsum3_sgf_v_8bpc_lsx
    addi.d a1, a1, (3*REST_UNIT_STRIDE+3) // src
    addi.d a2, a2, REST_UNIT_STRIDE<<2
    addi.d a2, a2, (REST_UNIT_STRIDE<<2)+12
    addi.d a3, a3, REST_UNIT_STRIDE<<2
    addi.d a3, a3, 6
.LBS3SGF_V_H:
    // A int32_t *sumsq
    addi.d t0, a2, -(REST_UNIT_STRIDE<<2) // -stride
    addi.d t1, a2, 0 // sumsq
    addi.d t2, a2, REST_UNIT_STRIDE<<2 // +stride
    addi.d t6, a1, 0
    addi.w t7, a4, 0
    addi.d t8, a0, 0
    // B coef *sum
    addi.d t3, a3, -(REST_UNIT_STRIDE<<1) // -stride
    addi.d t4, a3, 0
    addi.d t5, a3, REST_UNIT_STRIDE<<1

.LBS3SGF_V_W:
    vld vr0, t0, 0 // P[i - REST_UNIT_STRIDE]
    vld vr1, t0, 16
    vld vr2, t1, -4 // P[i-1]
    vld vr3, t1, 12
    vld vr4, t2, 0 // P[i + REST_UNIT_STRIDE]
    vld vr5, t2, 16
    vld vr6, t1, 0 // p[i]
    vld vr7, t1, 16
    vld vr8, t1, 4 // p[i+1]
    vld vr9, t1, 20

    vld vr10, t0, -4 // P[i - 1 - REST_UNIT_STRIDE]
    vld vr11, t0, 12
    vld vr12, t2, -4 // P[i - 1 + REST_UNIT_STRIDE]
    vld vr13, t2, 12
    vld vr14, t0, 4 // P[i + 1 - REST_UNIT_STRIDE]
    vld vr15, t0, 20
    vld vr16, t2, 4 // P[i + 1 + REST_UNIT_STRIDE]
    vld vr17, t2, 20

    vadd.w vr0, vr2, vr0
    vadd.w vr4, vr6, vr4
    vadd.w vr0, vr0, vr8
    vadd.w vr20, vr0, vr4
    vslli.w vr20, vr20, 2 // 0 1 2 3
    vadd.w vr0, vr1, vr3
    vadd.w vr4, vr5, vr7
    vadd.w vr0, vr0, vr9
    vadd.w vr21, vr0, vr4
    vslli.w vr21, vr21, 2 // 4 5 6 7
    vadd.w vr12, vr10, vr12
    vadd.w vr16, vr14, vr16
    vadd.w vr22, vr12, vr16
    vslli.w vr23, vr22, 1
    vadd.w vr22, vr23, vr22
    vadd.w vr11, vr11, vr13
    vadd.w vr15, vr15, vr17
    vadd.w vr0, vr11, vr15
    vslli.w vr23, vr0, 1
    vadd.w vr23, vr23, vr0
    vadd.w vr20, vr20, vr22 // b
    vadd.w vr21, vr21, vr23

    // B coef *sum
    vld vr0, t3, 0 // P[i - REST_UNIT_STRIDE]
    vld vr1, t4, -2 // p[i - 1]
    vld vr2, t4, 0 // p[i]
    vld vr3, t4, 2 // p[i + 1]
    vld vr4, t5, 0 // P[i + REST_UNIT_STRIDE]
    vld vr5, t3, -2 // P[i - 1 - REST_UNIT_STRIDE]
    vld vr6, t5, -2 // P[i - 1 + REST_UNIT_STRIDE]
    vld vr7, t3, 2 // P[i + 1 - REST_UNIT_STRIDE]
    vld vr8, t5, 2 // P[i + 1 + REST_UNIT_STRIDE]
    vaddwev.w.h vr9, vr0, vr1
    vaddwod.w.h vr10, vr0, vr1
    vaddwev.w.h vr11, vr2, vr3
    vaddwod.w.h vr12, vr2, vr3
    vadd.w vr9, vr11, vr9
    vadd.w vr10, vr12, vr10
    vilvl.w vr11, vr10, vr9 // 0 1 2 3
    vilvh.w vr12, vr10, vr9 // 4 5 6 7
    vsllwil.w.h vr0, vr4, 0
    vexth.w.h vr1, vr4
    vadd.w vr0, vr11, vr0
    vadd.w vr1, vr12, vr1
    vslli.w vr0, vr0, 2
    vslli.w vr1, vr1, 2
    vaddwev.w.h vr9, vr5, vr6
    vaddwod.w.h vr10, vr5, vr6
    vaddwev.w.h vr11, vr7, vr8
    vaddwod.w.h vr12, vr7, vr8
    vadd.w vr9, vr11, vr9
    vadd.w vr10, vr12, vr10
    vilvl.w vr13, vr10, vr9
    vilvh.w vr14, vr10, vr9
    vslli.w vr15, vr13, 1
    vslli.w vr16, vr14, 1
    vadd.w vr15, vr13, vr15 // a
    vadd.w vr16, vr14, vr16
    vadd.w vr22, vr0, vr15
    vadd.w vr23, vr1, vr16
    vld vr0, t6, 0 // src
    vsllwil.hu.bu vr0, vr0, 0
    vsllwil.wu.hu vr1, vr0, 0
    vexth.wu.hu vr2, vr0
    vmadd.w vr20, vr22, vr1
    vmadd.w vr21, vr23, vr2
    vssrlrni.h.w vr21, vr20, 9
    vst vr21, t8, 0
    addi.d t8, t8, 16

    addi.d t0, t0, 32
    addi.d t1, t1, 32
    addi.d t2, t2, 32
    addi.d t3, t3, 16
    addi.d t4, t4, 16
    addi.d t5, t5, 16
    addi.d t6, t6, 8
    addi.w t7, t7, -8
    blt zero, t7, .LBS3SGF_V_W

    addi.w a5, a5, -1
    addi.d a0, a0, 384*2
    addi.d a1, a1, REST_UNIT_STRIDE
    addi.d a3, a3, REST_UNIT_STRIDE<<1
    addi.d a2, a2, REST_UNIT_STRIDE<<2
    bnez a5, .LBS3SGF_V_H
endfunc

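// sgr_3x3_finish_8bpc_lsx below blends the filtered plane back into the
// picture with weight w1. Rough scalar model (simplified, not the dav1d C
// reference):
//
//   for (int j = 0; j < h; j++) {
//       for (int i = 0; i < w; i++) {
//           const int32_t u = p[i] << 4;                     // pixel in dst precision
//           const int32_t v = (u << 7) + w1 * (dst[i] - u);
//           const int32_t o = (v + (1 << 10)) >> 11;         // vssrarni.hu.w .., 11
//           p[i] = o < 0 ? 0 : o > 255 ? 255 : o;
//       }
//       p   += stride;
//       dst += FILTER_OUT_STRIDE;
//   }
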
#define FILTER_OUT_STRIDE (384)

/*
sgr_3x3_finish_c(const pixel *p, const ptrdiff_t stride,
                 const int16_t *dst, const int w1,
                 const int w, const int h);
*/
function sgr_3x3_finish_8bpc_lsx
    vreplgr2vr.w vr3, a3 // w1
    andi t4, a4, 0x7
    sub.w t5, a4, t4

    beq zero, t5, .LSGR3X3_REM

.LSGR3X3_H:
    addi.d t0, a0, 0
    addi.d t1, a2, 0
    addi.w t2, t5, 0
    andi t4, a4, 0x7
.LSGR3X3_W:
    vld vr0, t0, 0
    vld vr1, t1, 0
    vsllwil.hu.bu vr2, vr0, 4 // u 8 h
    vsllwil.wu.hu vr4, vr2, 0 // p
    vexth.wu.hu vr5, vr2 // p
    vslli.w vr6, vr4, 7
    vslli.w vr7, vr5, 7
    vsllwil.w.h vr8, vr1, 0 // dst
    vexth.w.h vr9, vr1 // dst
    vsub.w vr8, vr8, vr4
    vsub.w vr9, vr9, vr5
    vmadd.w vr6, vr8, vr3 // v 0 - 3
    vmadd.w vr7, vr9, vr3 // v 4 - 7
    vssrarni.hu.w vr7, vr6, 11
    vssrlni.bu.h vr7, vr7, 0
    vstelm.d vr7, t0, 0, 0
    addi.d t0, t0, 8
    addi.d t1, t1, 16
    addi.d t2, t2, -8
    bne zero, t2, .LSGR3X3_W

    beq t4, zero, .LSGR3X3_NOREM

    vld vr0, t0, 0
    vld vr1, t1, 0
    vsllwil.hu.bu vr2, vr0, 4 // u 8 h
    vsllwil.wu.hu vr4, vr2, 0 // p
    vexth.wu.hu vr5, vr2 // p
    vslli.w vr6, vr4, 7
    vslli.w vr7, vr5, 7
    vsllwil.w.h vr8, vr1, 0 // dst
    vexth.w.h vr9, vr1 // dst
    vsub.w vr8, vr8, vr4
    vsub.w vr9, vr9, vr5
    vmadd.w vr6, vr8, vr3 // v 0 - 3
    vmadd.w vr7, vr9, vr3 // v 4 - 7
    vssrarni.hu.w vr7, vr6, 11
    vssrlni.bu.h vr7, vr7, 0

.LSGR3X3_ST:
    vstelm.b vr7, t0, 0, 0
    addi.d t0, t0, 1
    vbsrl.v vr7, vr7, 1
    addi.w t4, t4, -1
    bnez t4, .LSGR3X3_ST

.LSGR3X3_NOREM:
    addi.w a5, a5, -1
    add.d a0, a0, a1
    addi.d a2, a2, (FILTER_OUT_STRIDE<<1)
    bnez a5, .LSGR3X3_H
    b .LSGR3X3_END

.LSGR3X3_REM:
    andi t4, a4, 0x7
    addi.d t0, a0, 0
    vld vr0, t0, 0
    vld vr1, a2, 0
    vsllwil.hu.bu vr2, vr0, 4 // u 8 h
    vsllwil.wu.hu vr4, vr2, 0 // p
    vexth.wu.hu vr5, vr2 // p
    vslli.w vr6, vr4, 7
    vslli.w vr7, vr5, 7
    vsllwil.w.h vr8, vr1, 0 // dst
    vexth.w.h vr9, vr1 // dst
    vsub.w vr8, vr8, vr4
    vsub.w vr9, vr9, vr5
    vmadd.w vr6, vr8, vr3 // v 0 - 3
    vmadd.w vr7, vr9, vr3 // v 4 - 7
    vssrarni.hu.w vr7, vr6, 11
    vssrlni.bu.h vr7, vr7, 0

.LSGR3X3_REM_ST:
    vstelm.b vr7, t0, 0, 0
    addi.d t0, t0, 1
    vbsrl.v vr7, vr7, 1
    addi.w t4, t4, -1
    bnez t4, .LSGR3X3_REM_ST
    addi.w a5, a5, -1
    add.d a0, a0, a1
    addi.d a2, a2, (FILTER_OUT_STRIDE<<1)
    bnez a5, .LSGR3X3_REM

.LSGR3X3_END:
endfunc

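// boxsum5_h_8bpc_lsx below is the 5x5 counterpart of boxsum3_h: a vertical
// 5-tap sum of pixels and of squared pixels. Rough scalar model (simplified;
// the width loop really runs in blocks of 16):
//
//   for (int y = 0; y < h - 4; y++)
//       for (int x = 0; x < w; x++) {
//           int32_t s = 0, s2 = 0;
//           for (int k = 0; k < 5; k++) {
//               const int v = src[(y + k) * REST_UNIT_STRIDE + x];
//               s  += v;
//               s2 += v * v;
//           }
//           sum  [(y + 1) * REST_UNIT_STRIDE + x] = s;
//           sumsq[(y + 1) * REST_UNIT_STRIDE + x] = s2;
//       }
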
/*
void boxsum5_h(int32_t *sumsq, coef *sum,
               const pixel *const src,
               const int w, const int h)
*/
function boxsum5_h_8bpc_lsx
    addi.w a4, a4, -4
    addi.d a0, a0, REST_UNIT_STRIDE<<2
    addi.d a1, a1, REST_UNIT_STRIDE<<1
    li.w t6, 1
.LBOXSUM5_H_H:
    addi.w t3, a3, 0
    addi.d t2, a2, 0
    addi.d t0, a0, 0
    addi.d t1, a1, 0

.LBOXSUM5_H_W:
    vld vr0, t2, 0 // a
    vld vr1, t2, REST_UNIT_STRIDE // b
    vld vr2, t2, REST_UNIT_STRIDE<<1 // c
    vld vr3, t2, REST_UNIT_STRIDE*3 // d
    vld vr4, t2, REST_UNIT_STRIDE<<2 // e

    vilvl.b vr5, vr1, vr0
    vilvh.b vr6, vr1, vr0
    vilvl.b vr7, vr3, vr2
    vilvh.b vr8, vr3, vr2
    // sum_v
    vhaddw.hu.bu vr9, vr5, vr5 // 0 1 2 3 4 5 6 7
    vhaddw.hu.bu vr10, vr6, vr6 // 8 9 10 11 12 13 14 15 a+b
    vhaddw.hu.bu vr11, vr7, vr7
    vhaddw.hu.bu vr12, vr8, vr8
    vadd.h vr9, vr9, vr11
    vadd.h vr10, vr10, vr12 // a + b + c + d
    vsllwil.hu.bu vr11, vr4, 0
    vexth.hu.bu vr12, vr4
    vadd.h vr9, vr9, vr11
    vadd.h vr10, vr10, vr12
    vst vr9, t1, 0
    vst vr10, t1, 16
    addi.d t1, t1, 32

    // sumsq
    vmulwev.h.bu vr9, vr5, vr5 // a*a 0 1 2 3 4 5 6 7
    vmulwev.h.bu vr10, vr6, vr6 // a*a 8 9 10 11 12 13 14 15
    vmulwod.h.bu vr13, vr5, vr5 // b*b 0 1 2 3 4 5 6 7
    vmulwod.h.bu vr14, vr6, vr6 // b*b 8 9 10 11 12 13 14 15
    vmulwev.h.bu vr15, vr7, vr7 // c*c 0 1 2 3 4 5 6 7
    vmulwev.h.bu vr16, vr8, vr8 // c*c 8 9 10 11 12 13 14 15
    vmulwod.h.bu vr17, vr7, vr7 // d*d 0 1 2 3 4 5 6 7
    vmulwod.h.bu vr18, vr8, vr8 // d*d 8 9 10 11 12 13 14 15
    vaddwev.w.hu vr5, vr9, vr13 // 0 2 4 6
    vaddwod.w.hu vr6, vr9, vr13 // 1 3 5 7
    vaddwev.w.hu vr7, vr10, vr14 // 8 10 12 14
    vaddwod.w.hu vr8, vr10, vr14 // 9 11 13 15 a + b
    vaddwev.w.hu vr19, vr15, vr17 // 0 2 4 6
    vaddwod.w.hu vr20, vr15, vr17 // 1 3 5 7
    vaddwev.w.hu vr21, vr16, vr18 // 8 10 12 14
    vaddwod.w.hu vr22, vr16, vr18 // 9 11 13 15 c + d
    vadd.w vr5, vr5, vr19
    vadd.w vr6, vr6, vr20
    vadd.w vr7, vr7, vr21
    vadd.w vr8, vr8, vr22
    vilvl.w vr19, vr6, vr5
    vilvh.w vr20, vr6, vr5
    vilvl.w vr21, vr8, vr7
    vilvh.w vr22, vr8, vr7
    vmul.h vr11, vr11, vr11
    vmul.h vr12, vr12, vr12
    vsllwil.wu.hu vr0, vr11, 0
    vexth.wu.hu vr1, vr11
    vsllwil.wu.hu vr2, vr12, 0
    vexth.wu.hu vr3, vr12
    vadd.w vr19, vr19, vr0
    vadd.w vr20, vr20, vr1
    vadd.w vr21, vr21, vr2
    vadd.w vr22, vr22, vr3
    vst vr19, t0, 0
    vst vr20, t0, 16
    vst vr21, t0, 32
    vst vr22, t0, 48
    addi.d t0, t0, 64
    addi.d t2, t2, 16
    addi.w t3, t3, -16
    blt zero, t3, .LBOXSUM5_H_W

    addi.d a0, a0, REST_UNIT_STRIDE<<2
    addi.d a1, a1, REST_UNIT_STRIDE<<1
    addi.d a2, a2, REST_UNIT_STRIDE
    addi.d a4, a4, -1
    bnez a4, .LBOXSUM5_H_H
endfunc

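// boxsum5_v_8bpc_lsx below finishes the 5x5 box sums with an in-place
// horizontal 5-tap sum. Rough scalar model (simplified); as in boxsum3_v,
// the reads must see the pre-sum values, which the vector code guarantees
// by carrying the lanes it is about to overwrite in registers:
//
//   for (int y = 1; y <= h - 4; y++)
//       for (int x = 2; x < w - 2; x++) {
//           sum  [y * REST_UNIT_STRIDE + x] = v_sum  (x - 2) + v_sum  (x - 1) + v_sum  (x)
//                                           + v_sum  (x + 1) + v_sum  (x + 2);
//           sumsq[y * REST_UNIT_STRIDE + x] = v_sumsq(x - 2) + v_sumsq(x - 1) + v_sumsq(x)
//                                           + v_sumsq(x + 1) + v_sumsq(x + 2);
//       }
//   // where v_sum(x)/v_sumsq(x) denote the values written by boxsum5_h
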
/*
void boxsum5_v(int32_t *sumsq, coef *sum,
               const int w, const int h)
*/
function boxsum5_v_8bpc_lsx
    addi.d a0, a0, (REST_UNIT_STRIDE<<2)
    addi.d a1, a1, (REST_UNIT_STRIDE<<1)
    addi.w a3, a3, -4
    addi.w a2, a2, -4

.LBOXSUM5_V_H:
    addi.w t3, a2, 0
    addi.d t0, a0, 0
    addi.d t1, a1, 0
    addi.d t2, a0, 8
    addi.d t3, a1, 4
    addi.d t4, a2, 0

    vld vr0, t1, 0 // a 0 1 2 3 4 5 6 7
    vld vr1, t1, 2 // b 1 2 3 4 5 6 7 8
    vld vr2, t1, 4 // c 2
    vld vr3, t1, 6 // d 3
    vld vr4, t1, 8 // e 4 5 6 7 8 9 10 11
    vadd.h vr5, vr0, vr1
    vadd.h vr6, vr2, vr3
    vpickve2gr.w t5, vr4, 2
    vadd.h vr5, vr5, vr6
    vadd.h vr5, vr5, vr4
    vst vr5, t3, 0

    vld vr0, t0, 0 // 0 1 2 3 a
    vld vr1, t0, 4 // 1 2 3 4 b
    vld vr2, t0, 8 // 2 3 4 5 c
    vld vr3, t0, 12 // 3 4 5 6 d
    vld vr4, t0, 16 // 4 5 6 7 e a
    vld vr5, t0, 20 // 5 6 7 8 b
    vld vr6, t0, 24 // 6 7 8 9 c
    vld vr7, t0, 28 // 7 8 9 10 d
    vld vr8, t0, 32 // 8 9 10 11 e

    vadd.w vr9, vr0, vr1
    vadd.w vr10, vr2, vr3
    vadd.w vr9, vr9, vr10
    vadd.w vr9, vr9, vr4
    vadd.w vr10, vr4, vr5
    vadd.w vr11, vr6, vr7
    vadd.w vr10, vr10, vr8
    vadd.w vr10, vr10, vr11
    vst vr9, t2, 0
    vst vr10, t2, 16

    addi.d t3, t3, 16
    addi.d t1, t1, 16
    addi.d t0, t0, 32
    addi.d t2, t2, 32
    addi.w t4, t4, -8
    ble t4, zero, .LBOXSUM5_V_H1

.LBOXSUM5_V_W:
    vld vr0, t1, 0 // a 0 1 2 3 4 5 6 7
    vld vr1, t1, 2 // b 1 2 3 4 5 6 7 8
    vld vr2, t1, 4 // c 2
    vld vr3, t1, 6 // d 3
    vld vr4, t1, 8 // e 4 5 6 7 8 9 10 11
    vinsgr2vr.w vr0, t5, 0
    vpickve2gr.w t5, vr4, 2
    vextrins.h vr1, vr0, 0x01
    vadd.h vr5, vr0, vr1
    vadd.h vr6, vr2, vr3
    vadd.h vr5, vr5, vr6
    vadd.h vr5, vr5, vr4
    vst vr5, t3, 0

    vaddi.hu vr0, vr8, 0 // 8 9 10 11 a
    vld vr1, t0, 4 // 9 10 11 12 b
    vld vr2, t0, 8 // 10 11 12 13 c
    vld vr3, t0, 12 // 14 15 16 17 d
    vld vr4, t0, 16 // 15 16 17 18 e a
    vld vr5, t0, 20 // 16 17 18 19 b
    vld vr6, t0, 24 // 17 18 19 20 c
    vld vr7, t0, 28 // 18 19 20 21 d
    vld vr8, t0, 32 // 19 20 21 22 e
    vextrins.w vr1, vr0, 0x01
    vadd.w vr9, vr0, vr1
    vadd.w vr10, vr2, vr3
    vadd.w vr9, vr9, vr10
    vadd.w vr9, vr9, vr4
    vadd.w vr10, vr4, vr5
    vadd.w vr11, vr6, vr7
    vadd.w vr10, vr10, vr8
    vadd.w vr10, vr10, vr11
    vst vr9, t2, 0
    vst vr10, t2, 16

    addi.d t3, t3, 16
    addi.d t1, t1, 16
    addi.d t0, t0, 32
    addi.d t2, t2, 32
    addi.w t4, t4, -8
    blt zero, t4, .LBOXSUM5_V_W

.LBOXSUM5_V_H1:
    addi.d a1, a1, REST_UNIT_STRIDE<<1
    addi.d a0, a0, REST_UNIT_STRIDE<<2
    addi.w a3, a3, -1
    bnez a3, .LBOXSUM5_V_H
endfunc

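// The two routines below are the 5x5 counterparts of boxsum3_sgf_h/_v.
// boxsum5_sgf_h_8bpc_lsx applies the same per-element surface math as the
// 3x3 version above, but with n = 25 and sgr_one_by_x = 164, and it only
// visits every second row. boxsum5_sgf_v_8bpc_lsx then emits two dst rows
// per iteration; a rough model of its weighting (S = REST_UNIT_STRIDE,
// A = int32 sumsq plane, B = coef sum plane):
//
//   // rows with surface data above and below them (stored at dst + 0):
//   b = (A[i - S] + A[i + S]) * 6 +
//       (A[i - 1 - S] + A[i + 1 - S] + A[i - 1 + S] + A[i + 1 + S]) * 5;
//   a = (B[i - S] + B[i + S]) * 6 +
//       (B[i - 1 - S] + B[i + 1 - S] + B[i - 1 + S] + B[i + 1 + S]) * 5;
//   dst[i] = (b + a * src[i] + (1 << 8)) >> 9;
//
//   // the alternate rows (stored at dst + 384) use only the surface row
//   // they lie on:
//   b = A[i] * 6 + (A[i - 1] + A[i + 1]) * 5;
//   a = B[i] * 6 + (B[i - 1] + B[i + 1]) * 5;
//   dst[i] = (b + a * src[i] + (1 << 7)) >> 8;
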
/*
selfguided_filter(int32_t *sumsq, coef *sum,
                  const int w, const int h,
                  const unsigned s)
*/
function boxsum5_sgf_h_8bpc_lsx
    addi.d a0, a0, REST_UNIT_STRIDE<<2
    addi.d a0, a0, 12 // AA
    addi.d a1, a1, REST_UNIT_STRIDE<<1
    addi.d a1, a1, 6 // BB
    la.local t8, dav1d_sgr_x_by_x
    li.w t6, 164
    vreplgr2vr.w vr20, t6
    li.w t6, 255
    vreplgr2vr.w vr22, t6
    vaddi.wu vr21, vr22, 1 // 256
    vreplgr2vr.w vr6, a4
    vldi vr19, 0x819
    addi.w a2, a2, 2 // w + 2
    addi.w a3, a3, 2 // h + 2

.LBS5SGF_H_H:
    addi.w t2, a2, 0
    addi.d t0, a0, -4
    addi.d t1, a1, -2

.LBS5SGF_H_W:
    vld vr0, t0, 0 // AA[i]
    vld vr1, t0, 16
    vld vr2, t1, 0 // BB[i]

    vmul.w vr4, vr0, vr19 // a * n
    vmul.w vr5, vr1, vr19 // a * n
    vsllwil.w.h vr9, vr2, 0
    vexth.w.h vr10, vr2
    vmsub.w vr4, vr9, vr9 // p
    vmsub.w vr5, vr10, vr10 // p
    vmaxi.w vr4, vr4, 0
    vmaxi.w vr5, vr5, 0 // p
    vmul.w vr4, vr4, vr6 // p * s
    vmul.w vr5, vr5, vr6 // p * s
    vsrlri.w vr4, vr4, 20
    vsrlri.w vr5, vr5, 20 // z
    vmin.w vr4, vr4, vr22
    vmin.w vr5, vr5, vr22

    // load table data
    vpickve2gr.w t6, vr4, 0
    ldx.bu t7, t8, t6
    vinsgr2vr.w vr7, t7, 0
    vpickve2gr.w t6, vr4, 1
    ldx.bu t7, t8, t6
    vinsgr2vr.w vr7, t7, 1
    vpickve2gr.w t6, vr4, 2
    ldx.bu t7, t8, t6
    vinsgr2vr.w vr7, t7, 2
    vpickve2gr.w t6, vr4, 3
    ldx.bu t7, t8, t6
    vinsgr2vr.w vr7, t7, 3

    vpickve2gr.w t6, vr5, 0
    ldx.bu t7, t8, t6
    vinsgr2vr.w vr8, t7, 0
    vpickve2gr.w t6, vr5, 1
    ldx.bu t7, t8, t6
    vinsgr2vr.w vr8, t7, 1
    vpickve2gr.w t6, vr5, 2
    ldx.bu t7, t8, t6
    vinsgr2vr.w vr8, t7, 2
    vpickve2gr.w t6, vr5, 3
    ldx.bu t7, t8, t6
    vinsgr2vr.w vr8, t7, 3 // x

    vmul.w vr9, vr7, vr9 // x * BB[i]
    vmul.w vr10, vr8, vr10
    vmul.w vr9, vr9, vr20 // x * BB[i] * sgr_one_by_x
    vmul.w vr10, vr10, vr20
    vsrlri.w vr9, vr9, 12
    vsrlri.w vr10, vr10, 12
    vsub.w vr7, vr21, vr7
    vsub.w vr8, vr21, vr8
    vpickev.h vr8, vr8, vr7
    vst vr9, t0, 0
    vst vr10, t0, 16
    vst vr8, t1, 0
    addi.d t0, t0, 32
    addi.d t1, t1, 16
    addi.w t2, t2, -8
    blt zero, t2, .LBS5SGF_H_W

    addi.d a0, a0, REST_UNIT_STRIDE<<2
    addi.d a0, a0, REST_UNIT_STRIDE<<2
    addi.d a1, a1, REST_UNIT_STRIDE<<2
    addi.w a3, a3, -2
    blt zero, a3, .LBS5SGF_H_H
endfunc

/*
selfguided_filter(coef *dst, pixel *src,
                  int32_t *sumsq, coef *sum,
                  const int w, const int h)
*/
function boxsum5_sgf_v_8bpc_lsx
    addi.d a1, a1, 3*REST_UNIT_STRIDE+3 // src
    addi.d a2, a2, (2*REST_UNIT_STRIDE+3)<<1 // A
    addi.d a2, a2, (2*REST_UNIT_STRIDE+3)<<1
    addi.d a3, a3, (2*REST_UNIT_STRIDE+3)<<1 // B
    addi.w a5, a5, -1
    vldi vr10, 0x806
    vldi vr11, 0x805
    vldi vr22, 0x406

.LBS5SGF_V_H:
    addi.d t0, a0, 0
    addi.d t1, a1, 0
    addi.d t2, a2, 0
    addi.d t3, a3, 0
    addi.w t4, a4, 0

    addi.d t5, a0, 384*2
    addi.d t6, a1, REST_UNIT_STRIDE
    addi.d t7, a2, REST_UNIT_STRIDE<<2
    addi.d t8, a3, REST_UNIT_STRIDE<<1 // B
.LBS5SGF_V_W:
    // a
    vld vr0, t3, -REST_UNIT_STRIDE*2
    vld vr1, t3, REST_UNIT_STRIDE*2
    vld vr2, t3, (-REST_UNIT_STRIDE-1)*2
    vld vr3, t3, (REST_UNIT_STRIDE-1)*2
    vld vr4, t3, (1-REST_UNIT_STRIDE)*2
    vld vr5, t3, (1+REST_UNIT_STRIDE)*2
    vaddwev.w.h vr6, vr0, vr1
    vaddwod.w.h vr7, vr0, vr1
    vmul.w vr6, vr6, vr10
    vmul.w vr7, vr7, vr10
    vaddwev.w.h vr8, vr2, vr3
    vaddwod.w.h vr9, vr2, vr3
    vaddwev.w.h vr12, vr4, vr5
    vaddwod.w.h vr13, vr4, vr5
    vadd.w vr8, vr8, vr12
    vadd.w vr9, vr9, vr13
    vmadd.w vr6, vr8, vr11
    vmadd.w vr7, vr9, vr11
    vilvl.w vr18, vr7, vr6
    vilvh.w vr19, vr7, vr6
    // b
    vld vr0, t2, -REST_UNIT_STRIDE*4
    vld vr1, t2, -REST_UNIT_STRIDE*4+16
    vld vr2, t2, REST_UNIT_STRIDE*4
    vld vr3, t2, REST_UNIT_STRIDE*4+16
    vld vr4, t2, (-REST_UNIT_STRIDE-1)*4
    vld vr5, t2, (-REST_UNIT_STRIDE-1)*4+16
    vld vr8, t2, (REST_UNIT_STRIDE-1)*4
    vld vr9, t2, (REST_UNIT_STRIDE-1)*4+16
    vld vr12, t2, (1-REST_UNIT_STRIDE)*4
    vld vr13, t2, (1-REST_UNIT_STRIDE)*4+16
    vld vr14, t2, (1+REST_UNIT_STRIDE)*4
    vld vr15, t2, (1+REST_UNIT_STRIDE)*4+16
    vadd.w vr0, vr0, vr2 // 0 1 2 3
    vadd.w vr1, vr1, vr3 // 4 5 6 7
    vmul.w vr20, vr0, vr10
    vmul.w vr21, vr1, vr10
    vadd.w vr4, vr4, vr8 // 0 1 2 3
    vadd.w vr5, vr5, vr9 // 4 5 6 7
    vadd.w vr12, vr12, vr14
    vadd.w vr13, vr13, vr15
    vadd.w vr12, vr12, vr4
    vadd.w vr13, vr13, vr5
    vmadd.w vr20, vr12, vr11
    vmadd.w vr21, vr13, vr11
    vld vr2, t1, 0
    vsllwil.hu.bu vr2, vr2, 0
    vsllwil.wu.hu vr3, vr2, 0
    vexth.wu.hu vr4, vr2
    vmadd.w vr20, vr18, vr3
    vmadd.w vr21, vr19, vr4
    vssrlrni.h.w vr21, vr20, 9
    vst vr21, t0, 0

    addi.d t1, t1, 8
    addi.d t2, t2, 32
    addi.d t3, t3, 16

    // a
    vld vr0, t8, 0
    vld vr1, t8, -2
    vld vr2, t8, 2
    vmulwev.w.h vr3, vr0, vr22
    vmulwod.w.h vr4, vr0, vr22
    vaddwev.w.h vr5, vr1, vr2
    vaddwod.w.h vr6, vr1, vr2
    vmadd.w vr3, vr5, vr11
    vmadd.w vr4, vr6, vr11
    vilvl.w vr19, vr4, vr3
    vilvh.w vr20, vr4, vr3
    // b
    vld vr0, t7, 0
    vld vr1, t7, -4
    vld vr2, t7, 4
    vld vr5, t7, 16
    vld vr6, t7, 12
    vld vr7, t7, 20
    vmul.w vr8, vr0, vr10
    vmul.w vr9, vr5, vr10
    vadd.w vr12, vr1, vr2
    vadd.w vr13, vr6, vr7
    vmadd.w vr8, vr12, vr11
    vmadd.w vr9, vr13, vr11
    vld vr2, t6, 0
    vsllwil.hu.bu vr2, vr2, 0
    vsllwil.wu.hu vr3, vr2, 0
    vexth.wu.hu vr4, vr2
    vmadd.w vr8, vr19, vr3
    vmadd.w vr9, vr20, vr4
    vssrlrni.h.w vr9, vr8, 8
    vst vr9, t0, 384*2

    addi.d t0, t0, 16
    addi.d t8, t8, 16
    addi.d t7, t7, 32
    addi.d t6, t6, 8
    addi.w t4, t4, -8
    blt zero, t4, .LBS5SGF_V_W

    addi.w a5, a5, -2
    addi.d a0, a0, 384*4 // dst
    addi.d a1, a1, REST_UNIT_STRIDE<<1 // src
    addi.d a2, a2, REST_UNIT_STRIDE<<2 //
    addi.d a2, a2, REST_UNIT_STRIDE<<2
    addi.d a3, a3, REST_UNIT_STRIDE<<2 //
    blt zero, a5, .LBS5SGF_V_H
    bnez a5, .LBS5SGF_END
.LBS5SGF_V_W1:
    // a
    vld vr0, a3, -REST_UNIT_STRIDE*2
    vld vr1, a3, REST_UNIT_STRIDE*2
    vld vr2, a3, (-REST_UNIT_STRIDE-1)*2
    vld vr3, a3, (REST_UNIT_STRIDE-1)*2
    vld vr4, a3, (1-REST_UNIT_STRIDE)*2
    vld vr5, a3, (1+REST_UNIT_STRIDE)*2
    vaddwev.w.h vr6, vr0, vr1
    vaddwod.w.h vr7, vr0, vr1
    vmul.w vr6, vr6, vr10
    vmul.w vr7, vr7, vr10
    vaddwev.w.h vr8, vr2, vr3
    vaddwod.w.h vr9, vr2, vr3
    vaddwev.w.h vr12, vr4, vr5
    vaddwod.w.h vr13, vr4, vr5
    vadd.w vr8, vr8, vr12
    vadd.w vr9, vr9, vr13
    vmadd.w vr6, vr8, vr11
    vmadd.w vr7, vr9, vr11
    vilvl.w vr18, vr7, vr6
    vilvh.w vr19, vr7, vr6
    // b
    vld vr0, a2, -REST_UNIT_STRIDE*4
    vld vr1, a2, -REST_UNIT_STRIDE*4+16
    vld vr2, a2, REST_UNIT_STRIDE*4
    vld vr3, a2, REST_UNIT_STRIDE*4+16
    vld vr4, a2, (-REST_UNIT_STRIDE-1)*4
    vld vr5, a2, (-REST_UNIT_STRIDE-1)*4+16
    vld vr8, a2, (REST_UNIT_STRIDE-1)*4
    vld vr9, a2, (REST_UNIT_STRIDE-1)*4+16
    vld vr12, a2, (1-REST_UNIT_STRIDE)*4
    vld vr13, a2, (1-REST_UNIT_STRIDE)*4+16
    vld vr14, a2, (1+REST_UNIT_STRIDE)*4
    vld vr15, a2, (1+REST_UNIT_STRIDE)*4+16
    vadd.w vr0, vr0, vr2 // 0 1 2 3
    vadd.w vr1, vr1, vr3 // 4 5 6 7
    vmul.w vr20, vr0, vr10
    vmul.w vr21, vr1, vr10
    vadd.w vr4, vr4, vr8 // 0 1 2 3
    vadd.w vr5, vr5, vr9 // 4 5 6 7
    vadd.w vr12, vr12, vr14
    vadd.w vr13, vr13, vr15
    vadd.w vr12, vr12, vr4
    vadd.w vr13, vr13, vr5
    vmadd.w vr20, vr12, vr11
    vmadd.w vr21, vr13, vr11
    vld vr2, a1, 0
    vsllwil.hu.bu vr2, vr2, 0
    vsllwil.wu.hu vr3, vr2, 0
    vexth.wu.hu vr4, vr2
    vmadd.w vr20, vr18, vr3
    vmadd.w vr21, vr19, vr4
    vssrlrni.h.w vr21, vr20, 9
    vst vr21, a0, 0
    addi.d a3, a3, 16
    addi.d a2, a2, 32
    addi.d a1, a1, 8
    addi.d a0, a0, 16
    addi.w a4, a4, -8
    blt zero, a4, .LBS5SGF_V_W1
.LBS5SGF_END:
endfunc

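// sgr_mix_finish_8bpc_lsx below blends both self-guided outputs into the
// picture with weights w0 and w1. Rough scalar model (simplified, not the
// dav1d C reference):
//
//   for (int j = 0; j < h; j++) {
//       for (int i = 0; i < w; i++) {
//           const int32_t u = p[i] << 4;
//           const int32_t v = (u << 7) + w0 * (dst0[i] - u) + w1 * (dst1[i] - u);
//           const int32_t o = (v + (1 << 10)) >> 11;
//           p[i] = o < 0 ? 0 : o > 255 ? 255 : o;
//       }
//       p    += stride;
//       dst0 += FILTER_OUT_STRIDE;
//       dst1 += FILTER_OUT_STRIDE;
//   }
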
/*
void dav1d_sgr_mix_finish_lsx(uint8_t *p, const ptrdiff_t stride,
                              const int16_t *dst0, const int16_t *dst1,
                              const int w0, const int w1,
                              const int w, const int h);
*/
function sgr_mix_finish_8bpc_lsx
    vreplgr2vr.w vr3, a4 // w0
    vreplgr2vr.w vr13, a5 // w1
    andi t4, a6, 0x7
    sub.w t5, a6, t4

    beq zero, t5, .LSGRMIX_REM

.LSGRMIX_H:
    addi.d t0, a0, 0
    addi.d t1, a2, 0 // dst0
    addi.d t3, a3, 0 // dst1
    addi.w t2, t5, 0
    andi t4, a6, 0x7
.LSGRMIX_W:
    vld vr0, t0, 0
    vld vr1, t1, 0
    vld vr10, t3, 0
    vsllwil.hu.bu vr2, vr0, 4 // u 8 h
    vsllwil.wu.hu vr4, vr2, 0 // u 0 1 2 3
    vexth.wu.hu vr5, vr2 // u 4 5 6 7
    vslli.w vr6, vr4, 7
    vslli.w vr7, vr5, 7
    vsllwil.w.h vr8, vr1, 0 // dst0
    vexth.w.h vr9, vr1 // dst0
    vsub.w vr8, vr8, vr4
    vsub.w vr9, vr9, vr5
    vmadd.w vr6, vr8, vr3 // v 0 - 3
    vmadd.w vr7, vr9, vr3 // v 4 - 7

    vsllwil.w.h vr11, vr10, 0 // dst1
    vexth.w.h vr12, vr10 // dst1
    vsub.w vr11, vr11, vr4
    vsub.w vr12, vr12, vr5
    vmadd.w vr6, vr11, vr13
    vmadd.w vr7, vr12, vr13

    vssrarni.hu.w vr7, vr6, 11
    vssrlni.bu.h vr7, vr7, 0
    vstelm.d vr7, t0, 0, 0
    addi.d t0, t0, 8
    addi.d t1, t1, 16
    addi.d t3, t3, 16
    addi.d t2, t2, -8
    bne zero, t2, .LSGRMIX_W

    beq t4, zero, .LSGRMIX_W8

    vld vr0, t0, 0
    vld vr1, t1, 0
    vld vr10, t3, 0
    vsllwil.hu.bu vr2, vr0, 4 // u 8 h
    vsllwil.wu.hu vr4, vr2, 0 // p
    vexth.wu.hu vr5, vr2 // p
    vslli.w vr6, vr4, 7
    vslli.w vr7, vr5, 7
    vsllwil.w.h vr8, vr1, 0 // dst
    vexth.w.h vr9, vr1 // dst
    vsub.w vr8, vr8, vr4
    vsub.w vr9, vr9, vr5
    vmadd.w vr6, vr8, vr3 // v 0 - 3
    vmadd.w vr7, vr9, vr3 // v 4 - 7

    vsllwil.w.h vr11, vr10, 0 // dst1
    vexth.w.h vr12, vr10 // dst1
    vsub.w vr11, vr11, vr4
    vsub.w vr12, vr12, vr5
    vmadd.w vr6, vr11, vr13
    vmadd.w vr7, vr12, vr13

    vssrarni.hu.w vr7, vr6, 11
    vssrlni.bu.h vr7, vr7, 0

.LSGRMIX_ST:
    vstelm.b vr7, t0, 0, 0
    addi.d t0, t0, 1
    vbsrl.v vr7, vr7, 1
    addi.w t4, t4, -1
    bnez t4, .LSGRMIX_ST

.LSGRMIX_W8:
    addi.w a7, a7, -1
    add.d a0, a0, a1
    addi.d a2, a2, (FILTER_OUT_STRIDE<<1)
    addi.d a3, a3, (FILTER_OUT_STRIDE<<1)
    bnez a7, .LSGRMIX_H
    b .LSGR_MIX_END

.LSGRMIX_REM:
    andi t4, a6, 0x7
    vld vr0, a0, 0
    vld vr1, a2, 0
    vld vr10, a3, 0
    vsllwil.hu.bu vr2, vr0, 4 // u 8 h
    vsllwil.wu.hu vr4, vr2, 0 // p
    vexth.wu.hu vr5, vr2 // p
    vslli.w vr6, vr4, 7
    vslli.w vr7, vr5, 7
    vsllwil.w.h vr8, vr1, 0 // dst
    vexth.w.h vr9, vr1 // dst
    vsub.w vr8, vr8, vr4
    vsub.w vr9, vr9, vr5
    vmadd.w vr6, vr8, vr3 // v 0 - 3
    vmadd.w vr7, vr9, vr3 // v 4 - 7

    vsllwil.w.h vr11, vr10, 0 // dst1
    vexth.w.h vr12, vr10 // dst1
    vsub.w vr11, vr11, vr4
    vsub.w vr12, vr12, vr5
    vmadd.w vr6, vr11, vr13
    vmadd.w vr7, vr12, vr13

    vssrarni.hu.w vr7, vr6, 11
    vssrlni.bu.h vr7, vr7, 0
    addi.d t0, a0, 0
.LSGRMIX_REM_ST:
    vstelm.b vr7, t0, 0, 0
    addi.d t0, t0, 1
    vbsrl.v vr7, vr7, 1
    addi.w t4, t4, -1
    bnez t4, .LSGRMIX_REM_ST

    addi.w a7, a7, -1
    add.d a0, a0, a1
    addi.d a2, a2, (FILTER_OUT_STRIDE<<1)
    addi.d a3, a3, (FILTER_OUT_STRIDE<<1)
    bnez a7, .LSGRMIX_REM

.LSGR_MIX_END:
endfunc