1/* 2 * Copyright © 2023, VideoLAN and dav1d authors 3 * Copyright © 2023, Loongson Technology Corporation Limited 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions are met: 8 * 9 * 1. Redistributions of source code must retain the above copyright notice, this 10 * list of conditions and the following disclaimer. 11 * 12 * 2. Redistributions in binary form must reproduce the above copyright notice, 13 * this list of conditions and the following disclaimer in the documentation 14 * and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 */

#include "src/loongarch/loongson_asm.S"

//
// AV1 in-loop deblocking filters, 8bpc, LoongArch LSX.
//
// Common register conventions (set up by LPF_FUNC before each FILTER_W*):
//   a0  = pixel pointer on the edge being filtered
//   a1  = picture stride in bytes
//   vr2 = H (high-edge-variance threshold, replicated bytes)
//   vr3 = E (edge threshold, replicated bytes)
//   vr4 = I (interior threshold, replicated bytes)
// DIR=h filters a vertical edge (pixels loaded across 4 consecutive rows);
// DIR=v filters a horizontal edge (pixels loaded from rows above/below a0).
// Each macro processes 4 pixels along the edge per invocation and stores
// the filtered pixels back in place. t5/t6 are used as scratch.
//

// Narrow filter: touches p1 p0 | q0 q1 around the edge.
// Skips all work (branch to end label) when no lane passes the fm test.
.macro FILTER_W4 DIR, TYPE
.ifc \DIR, h
    addi.d t5, a0, -2
    fld.s f6, t5, 0 //p1 p0 q0 q1
    fldx.s f7, t5, a1
    alsl.d t5, a1, t5, 1
    fld.s f8, t5, 0
    fldx.s f9, t5, a1

    // Transpose the 4x4 byte tile so each register holds one column.
    vilvl.b vr6, vr7, vr6
    vilvl.b vr7, vr9, vr8
    vilvl.h vr6, vr7, vr6 //p1p1p1p1
    vbsrl.v vr7, vr6, 4 //p0p0p0p0
    vbsrl.v vr8, vr7, 4 //q0q0q0q0
    vbsrl.v vr9, vr8, 4 //q1q1q1q1
.else
    sub.d t5, a0, a1
    fld.s f7, t5, 0
    sub.d t5, t5, a1
    fld.s f6, t5, 0
    fld.s f8, a0, 0
    fldx.s f9, a0, a1
.endif

    vabsd.bu vr10, vr6, vr7 // (p1 - p0)
    vabsd.bu vr11, vr9, vr8 // (q1 - q0)
    vabsd.bu vr12, vr7, vr8 // (p0 - q0)
    vabsd.bu vr13, vr6, vr9 // (p1 - q1)

    vmax.bu vr14, vr10, vr11
    vsle.bu vr15, vr14, vr4 //abs(p1 - p0) <= I && abs(q1 - q0) <= I
    vsadd.bu vr16, vr12, vr12
    vsrli.b vr17, vr13, 1
    vsadd.bu vr16, vr16, vr17 //abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1)
    vsle.bu vr16, vr16, vr3
    vand.v vr20, vr15, vr16 //fm

    // All 4 lanes failed the filter-mask test -> nothing to do.
    vpickve2gr.wu t5, vr20, 0
    beqz t5, .END_FILTER_\DIR\()\TYPE\()_W4

    vslt.bu vr16, vr2, vr14 //hev

    vsllwil.h.b vr30, vr20, 0 //expand fm to w
    vsllwil.w.h vr30, vr30, 0

    vsllwil.hu.bu vr17, vr6, 0
    vsllwil.hu.bu vr18, vr9, 0
    vsub.h vr17, vr17, vr18
    vssrarni.b.h vr17, vr17, 0 //f = iclip_diff(p1 - q1)

    vand.v vr17, vr17, vr16 // f is only applied on hev lanes
    vsllwil.h.b vr18, vr17, 0

    vsllwil.hu.bu vr10, vr8, 0
    vsllwil.hu.bu vr11, vr7, 0
    vsub.h vr10, vr10, vr11

    vsadd.h vr11, vr10, vr10
    vsadd.h vr10, vr10, vr11 //3 * (q0 - p0)
    vsadd.h vr10, vr10, vr18 //f = iclip_diff(3 * (q0 - p0) + f);
    vssrani.b.h vr10, vr10, 0
    vsllwil.h.b vr10, vr10, 0

    vaddi.hu vr11, vr10, 4
    vaddi.hu vr12, vr10, 3
    // Clamp f+4 / f+3 to int8 max (127) before the arithmetic >>3,
    // matching iclip_diff semantics.
    li.w t5, 127
    vreplgr2vr.h vr13, t5
    vmin.h vr11, vr11, vr13
    vmin.h vr12, vr12, vr13
    vsrai.h vr11, vr11, 3 //f1
    vsrai.h vr12, vr12, 3 //f2

    vsllwil.hu.bu vr13, vr7, 0 //p0
    vsllwil.hu.bu vr14, vr8, 0 //q0
    vsadd.h vr13, vr13, vr12
    vssub.h vr14, vr14, vr11
    vssrani.bu.h vr13, vr13, 0 //dst-1
    vssrani.bu.h vr14, vr14, 0 //dst+0

    vsrari.h vr15, vr11, 1 //f
    vsllwil.hu.bu vr18, vr6, 0 //p1
    vsllwil.hu.bu vr19, vr9, 0 //q1
    vsadd.h vr18, vr18, vr15
    vssub.h vr19, vr19, vr15
    vssrani.bu.h vr18, vr18, 0 //dst-2
    vssrani.bu.h vr19, vr19, 0 //dst+1
    // p1/q1 are only adjusted on lanes without high edge variance (!hev).
    vbitsel.v vr26, vr18, vr6, vr16
    vbitsel.v vr29, vr19, vr9, vr16

    // Commit filtered values only on lanes where fm is set.
    vbitsel.v vr6, vr6, vr26, vr20
    vbitsel.v vr7, vr7, vr13, vr20
    vbitsel.v vr8, vr8, vr14, vr20
    vbitsel.v vr9, vr9, vr29, vr20

.ifc \DIR, h
    // Transpose back to rows and store 4 bytes per row.
    vilvl.b vr6, vr7, vr6
    vilvl.b vr9, vr9, vr8
    vilvl.h vr6, vr9, vr6

    addi.d t5, a0, -2
    vstelm.w vr6, t5, 0, 0
    add.d t5, t5, a1
    vstelm.w vr6, t5, 0, 1
    add.d t5, t5, a1
    vstelm.w vr6, t5, 0, 2
    add.d t5, t5, a1
    vstelm.w vr6, t5, 0, 3
.else
    fst.s f8, a0, 0
    fstx.s f9, a0, a1
    sub.d t5, a0, a1
    fst.s f7, t5, 0
    sub.d t5, t5, a1
    fst.s f6, t5, 0
.endif
.END_FILTER_\DIR\()\TYPE\()_W4:
.endm

// 6-tap filter (uv only): touches p2 p1 p0 | q0 q1 q2.
// Chooses between the flat8in 3-tap smoothing and the narrow filter per lane.
.macro FILTER_W6 DIR, TYPE
.ifc \DIR, h
    addi.d t5, a0, -3
    fld.d f6, t5, 0 //p2 p1 p0 q0 q1 q2
    fldx.d f7, t5, a1
    alsl.d t5, a1, t5, 1
    fld.d f8, t5, 0
    fldx.d f9, t5, a1

    // Transpose 4 rows x 6 cols into per-column registers.
    vilvl.b vr6, vr7, vr6
    vilvl.b vr7, vr9, vr8
    vilvh.h vr10, vr7, vr6
    vilvl.h vr6, vr7, vr6

    vbsrl.v vr7, vr6, 4 //p1
    vbsrl.v vr8, vr7, 4 //p0
    vbsrl.v vr9, vr8, 4 //q0
    vbsrl.v vr11, vr10, 4 //q2
.else
    alsl.d t5, a1, a1, 1
    sub.d t5, a0, t5
    fld.d f6, t5, 0
    fldx.d f7, t5, a1
    alsl.d t5, a1, t5, 1
    fld.d f8, t5, 0
    fldx.d f9, t5, a1
    alsl.d t5, a1, t5, 1
    fld.d f10, t5, 0
    fldx.d f11, t5, a1
.endif

    vabsd.bu vr12, vr7, vr8 //abs(p1-p0)
    vabsd.bu vr13, vr10, vr9 //abs(q1-q0)
    vmax.bu vr14, vr12, vr13
    vslt.bu vr2, vr2, vr14 //hev
    vabsd.bu vr12, vr6, vr7 //abs(p2-p1)
    vmax.bu vr12, vr12, vr14
    vabsd.bu vr13, vr11, vr10 //abs(q2-q1)
    vmax.bu vr12, vr12, vr13
    vsle.bu vr0, vr12, vr4 // <=I

    vabsd.bu vr13, vr8, vr9 //abs(p0-q0)
    vsadd.bu vr13, vr13, vr13
    vabsd.bu vr15, vr7, vr10
    vsrli.b vr15, vr15, 1
    vsadd.bu vr13, vr13, vr15
    vsle.bu vr13, vr13, vr3 //abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E
    vand.v vr0, vr0, vr13 //fm

    // All 4 lanes failed the filter-mask test -> nothing to do.
    vpickve2gr.wu t5, vr0, 0
    beqz t5, .END_FILTER_\DIR\()\TYPE\()_W6

    vabsd.bu vr12, vr6, vr8 //abs(p2-p0)
    vabsd.bu vr13, vr11, vr9 //abs(q2-q0)
    vmax.bu vr12, vr12, vr14
    vmax.bu vr12, vr12, vr13
    vxor.v vr13, vr13, vr13
    vaddi.bu vr13, vr13, 1 // flatness threshold F = 1 (8bpc)
    vsle.bu vr1, vr12, vr13 //flat8in

    //6789 10 11 --expand to h
    vsllwil.hu.bu vr12, vr6, 0
    vsllwil.hu.bu vr13, vr7, 0
    vsllwil.hu.bu vr14, vr8, 0
    vsllwil.hu.bu vr15, vr9, 0
    vsllwil.hu.bu vr16, vr10, 0
    vsllwil.hu.bu vr17, vr11, 0

    // Rolling 8-sample box sums for the flat path; each dst reuses the
    // previous sum, adding the new sample and subtracting the departing one.
    //dst-2
    vsadd.hu vr18, vr12, vr12
    vsadd.hu vr18, vr18, vr12
    vsadd.hu vr18, vr18, vr13
    vsadd.hu vr18, vr18, vr13
    vsadd.hu vr18, vr18, vr14
    vsadd.hu vr18, vr18, vr14
    vsadd.hu vr18, vr18, vr15

    //dst-1
    vsadd.hu vr19, vr18, vr15
    vsadd.hu vr19, vr19, vr16
    vssub.hu vr19, vr19, vr12
    vssub.hu vr19, vr19, vr12

    //dst+0
    vsadd.hu vr20, vr19, vr17
    vsadd.hu vr20, vr20, vr16
    vssub.hu vr20, vr20, vr12
    vssub.hu vr20, vr20, vr13

    //dst+1
    vsadd.hu vr21, vr20, vr17
    vsadd.hu vr21, vr21, vr17
    vssub.hu vr21, vr21, vr13
    vssub.hu vr21, vr21, vr14

    vsrari.h vr18, vr18, 3
    vsrari.h vr19, vr19, 3
    vsrari.h vr20, vr20, 3
    vsrari.h vr21, vr21, 3

    // Narrow-filter path (used on lanes that are not flat).
    vsub.h vr22, vr13, vr16
    vssrani.b.h vr22, vr22, 0
    vand.v vr22, vr22, vr2
    vsllwil.h.b vr22, vr22, 0 //f = iclip_diff(p1 - q1);

    vsub.h vr23, vr15, vr14
    vsadd.h vr24, vr23, vr23
    vsadd.h vr23, vr23, vr24
    vsadd.h vr23, vr23, vr22
    vssrani.b.h vr23, vr23, 0
    vsllwil.h.b vr23, vr23, 0 //f = iclip_diff(3 * (q0 - p0) + f);

    vaddi.hu vr24, vr23, 4
    vaddi.hu vr25, vr23, 3
    // Clamp f+4 / f+3 to int8 max (127) before >>3 (iclip_diff).
    li.w t5, 127
    vreplgr2vr.h vr3, t5
    vmin.h vr24, vr24, vr3
    vmin.h vr25, vr25, vr3
    vsrai.h vr24, vr24, 3 //f1
    vsrai.h vr25, vr25, 3 //f2

    vsadd.h vr26, vr14, vr25 //dst-1
    vssub.h vr27, vr15, vr24 //dst+0

    vsrari.h vr24, vr24, 1
    vsadd.h vr28, vr13, vr24
    vssub.h vr29, vr16, vr24
    vsllwil.h.b vr2, vr2, 0
    // p1/q1 untouched on hev lanes.
    vbitsel.v vr28, vr28, vr13, vr2 //dst-2
    vbitsel.v vr29, vr29, vr16, vr2 //dst+1

    //flat8in
    vsllwil.h.b vr1, vr1, 0
    vbitsel.v vr18, vr28, vr18, vr1
    vbitsel.v vr19, vr26, vr19, vr1
    vbitsel.v vr20, vr27, vr20, vr1
    vbitsel.v vr21, vr29, vr21, vr1

    vssrani.bu.h vr18, vr18, 0
    vssrani.bu.h vr19, vr19, 0
    vssrani.bu.h vr20, vr20, 0
    vssrani.bu.h vr21, vr21, 0

    // Commit only on fm lanes.
    vbitsel.v vr7, vr7, vr18, vr0 //p1
    vbitsel.v vr8, vr8, vr19, vr0 //p0
    vbitsel.v vr9, vr9, vr20, vr0 //q0
    vbitsel.v vr10, vr10, vr21, vr0 //q1

.ifc \DIR, h
    // Transpose the 4 modified columns back and store 4 bytes per row.
    vilvl.b vr7, vr8, vr7
    vilvl.b vr9, vr10, vr9
    vilvl.h vr7, vr9, vr7

    addi.d t5, a0, -2
    vstelm.w vr7, t5, 0, 0
    add.d t5, t5, a1
    vstelm.w vr7, t5, 0, 1
    add.d t5, t5, a1
    vstelm.w vr7, t5, 0, 2
    add.d t5, t5, a1
    vstelm.w vr7, t5, 0, 3
.else
    fst.s f9, a0, 0
    fstx.s f10, a0, a1
    sub.d t5, a0, a1
    fst.s f8, t5, 0
    sub.d t5, t5, a1
    fst.s f7, t5, 0
.endif
.END_FILTER_\DIR\()\TYPE\()_W6:
.endm

// 8-tap filter (y only): touches p3 p2 p1 p0 | q0 q1 q2 q3.
// Per lane: flat8in 8-sample smoothing, else narrow filter.
// NOTE: vr3/vr4/vr5 (E/I/F) are overwritten once the thresholds are consumed.
.macro FILTER_W8 DIR, TYPE
.ifc \DIR, h
    addi.d t5, a0, -4
    fld.d f6, t5, 0 //p3 p2 p1 p0 q0 q1 q2 q3
    fldx.d f7, t5, a1
    alsl.d t5, a1, t5, 1
    fld.d f8, t5, 0
    fldx.d f9, t5, a1

    // Transpose 4 rows x 8 cols into per-column registers.
    vilvl.b vr6, vr7, vr6
    vilvl.b vr7, vr9, vr8
    vilvh.h vr10, vr7, vr6 //q0
    vilvl.h vr6, vr7, vr6 //p3
    vbsrl.v vr7, vr6, 4 //p2
    vbsrl.v vr8, vr6, 8 //p1
    vbsrl.v vr9, vr6, 12 //p0
    vbsrl.v vr11, vr10, 4 //q1
    vbsrl.v vr12, vr10, 8 //q2
    vbsrl.v vr13, vr10, 12 //q3
.else
    fld.s f10, a0, 0
    fldx.s f11, a0, a1
    add.d t5, a0, a1
    fldx.s f12, t5, a1
    add.d t5, t5, a1
    fldx.s f13, t5, a1
    sub.d t5, a0, a1
    fld.s f9, t5, 0
    sub.d t5, t5, a1
    fld.s f8, t5, 0
    sub.d t5, t5, a1
    fld.s f7, t5, 0
    sub.d t5, t5, a1
    fld.s f6, t5, 0
.endif

    vabsd.bu vr14, vr8, vr9 //p1-p0
    vabsd.bu vr15, vr11, vr10 //q1-q0
    vabsd.bu vr16, vr9, vr10 //p0-q0
    vabsd.bu vr17, vr8, vr11 //p1-q1
    vabsd.bu vr18, vr7, vr8 //p2-p1
    vabsd.bu vr19, vr12, vr11 //q2-q1
    vabsd.bu vr20, vr6, vr7 //p3-p2
    vabsd.bu vr21, vr13, vr12 //q3-q2

    vmax.bu vr22, vr14, vr15
    vsle.bu vr23, vr22, vr4 //abs(p1 - p0) <= I && abs(q1 - q0) <= I
    vsadd.bu vr16, vr16, vr16
    vsrli.b vr17, vr17, 1
    vsadd.bu vr16, vr16, vr17
    vsle.bu vr16, vr16, vr3 //abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E
    vand.v vr16, vr16, vr23 //fm

    // Early out when no lane passes the inner fm test; the outer-sample
    // I checks below only run when at least one lane survived.
    vpickve2gr.wu t5, vr16, 0
    beqz t5, .END_FILTER_\DIR\()\TYPE\()_W8

    vmax.bu vr23, vr18, vr19
    vmax.bu vr23, vr23, vr20
    vmax.bu vr23, vr23, vr21
    vsle.bu vr23, vr23, vr4
    vand.v vr16, vr16, vr23 //fm

    vabsd.bu vr17, vr7, vr9 //abs(p2-p0)
    vabsd.bu vr18, vr12, vr10 //abs(q2-q0)
    vmax.bu vr17, vr17, vr14
    vmax.bu vr17, vr17, vr15
    vmax.bu vr17, vr17, vr18
    vabsd.bu vr18, vr6, vr9 //abs(p3 - p0)
    vabsd.bu vr19, vr13, vr10 //abs(q3 - q0)
    vmax.bu vr17, vr17, vr18
    vmax.bu vr17, vr17, vr19

    vxor.v vr5, vr5, vr5
    vaddi.bu vr5, vr5, 1 //F
    vsle.bu vr17, vr17, vr5 //flat8in

    // Thresholds are consumed; vr0/vr1/vr3/vr4/vr5 now hold widened pixels.
    vsllwil.hu.bu vr0, vr6, 0 //p3
    vsllwil.hu.bu vr1, vr7, 0 //p2
    vsllwil.hu.bu vr27, vr8, 0 //p1
    vsllwil.hu.bu vr3, vr9, 0 //p0
    vsllwil.hu.bu vr4, vr10, 0 //q0
    vsllwil.hu.bu vr5, vr11, 0 //q1
    vsllwil.hu.bu vr14, vr12, 0 //q2
    vsllwil.hu.bu vr15, vr13, 0 //q3

    // Pairwise sums shared between the six 8-tap outputs.
    vsadd.hu vr18, vr0, vr0 //p3+p3
    vsadd.hu vr19, vr15, vr15 //q3+q3
    vsadd.hu vr20, vr0, vr1 //p3+p2
    vsadd.hu vr21, vr1, vr27 //p2+p1
    vsadd.hu vr28, vr27, vr3 //p1+p0
    vsadd.hu vr23, vr3, vr4 //p0+q0
    vsadd.hu vr24, vr4, vr5 //q0+q1
    vsadd.hu vr25, vr5, vr14 //q1+q2
    vsadd.hu vr26, vr14, vr15 //q2+q3

    // dst-3
    vsadd.hu vr29, vr18, vr20
    vsadd.hu vr29, vr29, vr21
    vsadd.hu vr29, vr29, vr23

    // dst-2
    vsadd.hu vr30, vr18, vr21
    vsadd.hu vr30, vr30, vr28
    vsadd.hu vr30, vr30, vr24

    // dst-1
    vsadd.hu vr31, vr20, vr28
    vsadd.hu vr31, vr31, vr23
    vsadd.hu vr31, vr31, vr25

    // dst+0
    vsadd.hu vr18, vr21, vr23
    vsadd.hu vr18, vr18, vr24
    vsadd.hu vr18, vr18, vr26

    //dst+1
    vsadd.hu vr20, vr28, vr24
    vsadd.hu vr20, vr20, vr25
    vsadd.hu vr20, vr20, vr19

    //dst+2
    vsadd.hu vr21, vr23, vr25
    vsadd.hu vr21, vr21, vr26
    vsadd.hu vr21, vr21, vr19

    // Round, shift by 3 and narrow back to u8.
    vssrarni.bu.h vr23, vr29, 3
    vssrarni.bu.h vr24, vr30, 3
    vssrarni.bu.h vr25, vr31, 3
    vssrarni.bu.h vr19, vr18, 3
    vssrarni.bu.h vr20, vr20, 3
    vssrarni.bu.h vr21, vr21, 3

    // !flat8in
    vslt.bu vr2, vr2, vr22 //hev

    vsub.h vr30, vr27, vr5 //p1-q1
    vssrani.b.h vr30, vr30, 0
    vand.v vr30, vr30, vr2
    vsllwil.h.b vr30, vr30, 0

    vsub.h vr31, vr4, vr3
    vsadd.h vr0, vr31, vr31
    vsadd.h vr31, vr31, vr0
    vsadd.h vr31, vr31, vr30
    vssrani.b.h vr31, vr31, 0
    vsllwil.h.b vr31, vr31, 0 //f = iclip_diff(3 * (q0 - p0) + f);

    vaddi.hu vr14, vr31, 4
    vaddi.hu vr15, vr31, 3
    // Clamp f+4 / f+3 to int8 max (127) before >>3 (iclip_diff).
    li.w t5, 127
    vreplgr2vr.h vr18, t5
    vmin.h vr14, vr14, vr18
    vmin.h vr15, vr15, vr18
    vsrai.h vr14, vr14, 3 //f1
    vsrai.h vr15, vr15, 3 //f2

    vsadd.h vr3, vr3, vr15
    vssub.h vr4, vr4, vr14
    vssrani.bu.h vr3, vr3, 0 //dst-1
    vssrani.bu.h vr4, vr4, 0 //dst+0

    vsrari.h vr14, vr14, 1
    vsadd.h vr18, vr27, vr14
    vssub.h vr26, vr5, vr14
    vssrani.bu.h vr18, vr18, 0 //dst-2
    vssrani.bu.h vr26, vr26, 0 //dst+1

    // p1/q1 untouched on hev lanes.
    vbitsel.v vr27, vr18, vr8, vr2 //dst-2
    vbitsel.v vr28, vr26, vr11, vr2 //dst+1

    // Per lane: flat8in ? 8-tap output : narrow-filter output.
    vbitsel.v vr23, vr7, vr23, vr17 //dst-3 (p2)
    vbitsel.v vr24, vr27, vr24, vr17 //dst-2
    vbitsel.v vr25, vr3, vr25, vr17 //dst-1
    vbitsel.v vr19, vr4, vr19, vr17 //dst+0
    vbitsel.v vr20, vr28, vr20, vr17 //dst+1
    vbitsel.v vr21, vr12, vr21, vr17 //dst+2

    // Commit only on fm lanes.
    vbitsel.v vr7, vr7, vr23, vr16 //-3
    vbitsel.v vr8, vr8, vr24, vr16 //-2
    vbitsel.v vr9, vr9, vr25, vr16 //-1
    vbitsel.v vr10, vr10, vr19, vr16 //+0
    vbitsel.v vr11, vr11, vr20, vr16 //+1
    vbitsel.v vr12, vr12, vr21, vr16 //+2

.ifc \DIR, h
    // Transpose back to rows and store 8 bytes per row.
    vilvl.b vr6, vr7, vr6
    vilvl.b vr8, vr9, vr8
    vilvl.b vr10, vr11, vr10
    vilvl.b vr12, vr13, vr12
    vilvl.h vr6, vr8, vr6 //p3p2p1p0 -- -- --
    vilvl.h vr10, vr12, vr10 //q0q1q2q3 -- -- --
    vilvl.w vr0, vr10, vr6 //p3p2p1p0q0q1q2q3 --
    vilvh.w vr1, vr10, vr6 //--

    addi.d t5, a0, -4
    vstelm.d vr0, t5, 0, 0
    add.d t5, t5, a1
    vstelm.d vr0, t5, 0, 1
    add.d t5, t5, a1
    vstelm.d vr1, t5, 0, 0
    add.d t5, t5, a1
    vstelm.d vr1, t5, 0, 1
.else
    alsl.d t5, a1, a1, 1
    sub.d t5, a0, t5
    fst.s f7, t5, 0
    fstx.s f8, t5, a1
    add.d t5, t5, a1
    fstx.s f9, t5, a1

    fst.s f10, a0, 0
    add.d t5, a0, a1
    fst.s f11, t5, 0
    fstx.s f12, t5, a1
.endif
.END_FILTER_\DIR\()\TYPE\()_W8:
.endm

// 16-tap wide filter (y only): touches p6..p0 | q0..q6.
// Three nested decisions per lane: flat8out&flat8in wide smoothing,
// flat8in 8-tap smoothing, else the narrow filter.
// DIR=v spills 12 loaded rows to a 96-byte sp scratch area (reloaded before
// the stores); the scratch is released at .endm under .ifc \DIR, v, so the
// early-out branch also passes through the sp restore.
.macro FILTER_W16 DIR, TYPE
.ifc \DIR, h
    addi.d t5, a0, -7
    vld vr6, t5, 0 //p6p5p4p3p2p1p0q0 q1q2q3q4q5q6
    vldx vr7, t5, a1
    add.d t5, t5, a1
    vldx vr8, t5, a1
    add.d t5, t5, a1
    vldx vr9, t5, a1

    // Transpose 4 rows x 16 cols into per-column registers.
    vilvl.b vr10, vr7, vr6
    vilvh.b vr11, vr7, vr6
    vilvl.b vr12, vr9, vr8
    vilvh.b vr13, vr9, vr8
    vilvl.h vr6, vr12, vr10
    vilvh.h vr10, vr12, vr10 //p2---
    vilvl.h vr15, vr13, vr11 //q1---
    vilvh.h vr19, vr13, vr11

    vbsrl.v vr7, vr6, 4 //p5---
    vbsrl.v vr8, vr6, 8 //p4---
    vbsrl.v vr9, vr6, 12 //p3---
    vbsrl.v vr12, vr10, 4 //p1---
    vbsrl.v vr13, vr10, 8 //p0---
    vbsrl.v vr14, vr10, 12 //q0---
    vbsrl.v vr16, vr15, 4 //q2---
    vbsrl.v vr17, vr15, 8 //q3---
    vbsrl.v vr18, vr15, 12 //q4---
    vbsrl.v vr20, vr19, 4 //q6---
.else
    slli.d t5, a1, 3
    sub.d t5, a0, t5
    fldx.s f6, t5, a1 //p6
    alsl.d t5, a1, t5, 1
    fld.s f7, t5, 0 //p5
    fldx.s f8, t5, a1 //p4
    alsl.d t5, a1, t5, 1
    fld.s f9, t5, 0 //p3
    fldx.s f10, t5, a1 //p2
    alsl.d t5, a1, t5, 1
    fld.s f12, t5, 0 //p1
    fldx.s f13, t5, a1 //p0
    alsl.d t5, a1, t5, 1
    fld.s f14, t5, 0 //q0
    fldx.s f15, t5, a1 //q1
    alsl.d t5, a1, t5, 1
    fld.s f16, t5, 0 //q2
    fldx.s f17, t5, a1 //q3
    alsl.d t5, a1, t5, 1
    fld.s f18, t5, 0 //q4
    fldx.s f19, t5, a1 //q5
    add.d t5, t5, a1
    fldx.s f20, t5, a1 //q6

    //temp store
    addi.d sp, sp, -96
    fst.d f7, sp, 0
    fst.d f8, sp, 8
    fst.d f9, sp, 16
    fst.d f10, sp, 24
    fst.d f12, sp, 32
    fst.d f13, sp, 40
    fst.d f14, sp, 48
    fst.d f15, sp, 56
    fst.d f16, sp, 64
    fst.d f17, sp, 72
    fst.d f18, sp, 80
    fst.d f19, sp, 88
.endif

    vabsd.bu vr21, vr12, vr13 //abs(p1-p0)
    vabsd.bu vr22, vr15, vr14 //abs(q1-q0)
    vmax.bu vr0, vr21, vr22
    vslt.bu vr2, vr2, vr0 //hev
    vabsd.bu vr1, vr10, vr12 //abs(p2-p1)
    vmax.bu vr0, vr0, vr1
    vabsd.bu vr1, vr16, vr15 //abs(q2-q1)
    vmax.bu vr0, vr0, vr1
    vabsd.bu vr1, vr9, vr10 //abs(p3-p2)
    vmax.bu vr0, vr0, vr1
    vabsd.bu vr1, vr17, vr16 //abs(q3-q2)
    vmax.bu vr0, vr0, vr1
    vsle.bu vr0, vr0, vr4 //vr4 released I
    vabsd.bu vr1, vr13, vr14 //abs(p0-q0)
    vsadd.bu vr1, vr1, vr1
    vabsd.bu vr4, vr12, vr15 //abs(p1-q1)
    vsrli.b vr4, vr4, 1
    vsadd.bu vr1, vr1, vr4 //abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1)
    vsle.bu vr1, vr1, vr3 //vr3 released E
    vand.v vr0, vr0, vr1 //fm

    // All 4 lanes failed the filter-mask test -> nothing to do
    // (for DIR=v the sp restore after the end label still runs).
    vpickve2gr.wu t5, vr0, 0
    beqz t5, .END_FILTER_\DIR\()\TYPE\()_W16

    vabsd.bu vr1, vr6, vr13 //abs(p6-p0)
    vabsd.bu vr4, vr7, vr13 //abs(p5-p0)
    vmax.bu vr1, vr1, vr4
    vabsd.bu vr4, vr8, vr13 //abs(p4-p0)
    vmax.bu vr1, vr1, vr4
    vabsd.bu vr4, vr18, vr14 //abs(q4-q0)
    vmax.bu vr1, vr1, vr4
    vabsd.bu vr4, vr19, vr14 //abs(q5-q0)
    vmax.bu vr1, vr1, vr4
    vabsd.bu vr4, vr20, vr14
    vmax.bu vr1, vr1, vr4
    vxor.v vr5, vr5, vr5
    vaddi.bu vr5, vr5, 1 //F
    vsle.bu vr1, vr1, vr5 //flat8out

    vabsd.bu vr3, vr10, vr13 //abs(p2-p0)
    vmax.bu vr3, vr3, vr21
    vmax.bu vr3, vr3, vr22
    vabsd.bu vr4, vr16, vr14 //abs(q2-q0)
    vmax.bu vr3, vr3, vr4
    vabsd.bu vr4, vr9, vr13 //abs(p3-p0)
    vmax.bu vr3, vr3, vr4
    vabsd.bu vr4, vr17, vr14 //abs(q3-q0)
    vmax.bu vr3, vr3, vr4
    vsle.bu vr3, vr3, vr5 //flatin released vr5

    // Widen all 14 taps to u16 in place.
    vsllwil.hu.bu vr6, vr6, 0 //p6
    vsllwil.hu.bu vr7, vr7, 0 //p5
    vsllwil.hu.bu vr8, vr8, 0 //p4
    vsllwil.hu.bu vr9, vr9, 0 //p3
    vsllwil.hu.bu vr10, vr10, 0 //p2
    vsllwil.hu.bu vr12, vr12, 0 //p1
    vsllwil.hu.bu vr13, vr13, 0 //p0
    vsllwil.hu.bu vr14, vr14, 0 //q0
    vsllwil.hu.bu vr15, vr15, 0 //q1
    vsllwil.hu.bu vr16, vr16, 0 //q2
    vsllwil.hu.bu vr17, vr17, 0 //q3
    vsllwil.hu.bu vr18, vr18, 0 //q4
    vsllwil.hu.bu vr19, vr19, 0 //q5
    vsllwil.hu.bu vr20, vr20, 0 //q6

    // Wide path: rolling 16-sample sums; dst-6 seeded with p6*7 (p6*8 - p6).
    //dst-6
    vslli.w vr21, vr6, 3
    vssub.hu vr21, vr21, vr6
    vsadd.hu vr21, vr21, vr7
    vsadd.hu vr21, vr21, vr7
    vsadd.hu vr21, vr21, vr8
    vsadd.hu vr21, vr21, vr8
    vsadd.hu vr21, vr21, vr9
    vsadd.hu vr21, vr21, vr10
    vsadd.hu vr21, vr21, vr12
    vsadd.hu vr21, vr21, vr13
    vsadd.hu vr21, vr21, vr14

    //dst-5
    vsadd.hu vr22, vr21, vr15
    vsadd.hu vr22, vr22, vr9
    vssub.hu vr22, vr22, vr6
    vssub.hu vr22, vr22, vr6

    //dst-4
    vsadd.hu vr23, vr22, vr16
    vsadd.hu vr23, vr23, vr10
    vssub.hu vr23, vr23, vr7
    vssub.hu vr23, vr23, vr6

    //dst-3
    vsadd.hu vr24, vr23, vr12
    vsadd.hu vr24, vr24, vr17
    vssub.hu vr24, vr24, vr6
    vssub.hu vr24, vr24, vr8

    //dst-2
    vsadd.hu vr25, vr24, vr18
    vsadd.hu vr25, vr25, vr13
    vssub.hu vr25, vr25, vr6
    vssub.hu vr25, vr25, vr9

    //dst-1
    vsadd.hu vr26, vr25, vr19
    vsadd.hu vr26, vr26, vr14
    vssub.hu vr26, vr26, vr6
    vssub.hu vr26, vr26, vr10

    //dst+0
    vsadd.hu vr27, vr26, vr20
    vsadd.hu vr27, vr27, vr15
    vssub.hu vr27, vr27, vr6
    vssub.hu vr27, vr27, vr12

    //dst+1
    vsadd.hu vr28, vr27, vr20
    vsadd.hu vr28, vr28, vr16
    vssub.hu vr28, vr28, vr7
    vssub.hu vr28, vr28, vr13

    //dst+2
    vsadd.hu vr29, vr28, vr20
    vsadd.hu vr29, vr29, vr17
    vssub.hu vr29, vr29, vr8
    vssub.hu vr29, vr29, vr14

    //dst+3
    vsadd.hu vr30, vr29, vr20
    vsadd.hu vr30, vr30, vr18
    vssub.hu vr30, vr30, vr9
    vssub.hu vr30, vr30, vr15

    //dst+4
    vsadd.hu vr31, vr30, vr20
    vsadd.hu vr31, vr31, vr19
    vssub.hu vr31, vr31, vr10
    vssub.hu vr31, vr31, vr16

    //dst+5
    vsadd.hu vr11, vr31, vr20
    vsadd.hu vr11, vr11, vr20
    vssub.hu vr11, vr11, vr12
    vssub.hu vr11, vr11, vr17

    vsrari.h vr21, vr21, 4
    vsrari.h vr22, vr22, 4
    vsrari.h vr23, vr23, 4
    vsrari.h vr24, vr24, 4
    vsrari.h vr25, vr25, 4
    vsrari.h vr26, vr26, 4
    vsrari.h vr27, vr27, 4
    vsrari.h vr28, vr28, 4
    vsrari.h vr29, vr29, 4
    vsrari.h vr30, vr30, 4
    vsrari.h vr31, vr31, 4
    vsrari.h vr11, vr11, 4

    vand.v vr1, vr1, vr3
    vsllwil.h.b vr1, vr1, 0 //expand to h
    //(flat8out & flat8in)
    vbitsel.v vr21, vr7, vr21, vr1 //dst-6
    vbitsel.v vr22, vr8, vr22, vr1 //dst-5
    vbitsel.v vr23, vr9, vr23, vr1 //dst-4
    vbitsel.v vr30, vr17, vr30, vr1 //dst+3
    vbitsel.v vr31, vr18, vr31, vr1 //dst+4
    vbitsel.v vr11, vr19, vr11, vr1 //dst+5

    //flat8in
    //dst-3
    vslli.h vr4, vr9, 1
    vsadd.hu vr4, vr4, vr9 //p3*3
    vsadd.hu vr4, vr4, vr10
    vsadd.hu vr4, vr4, vr10
    vsadd.hu vr4, vr4, vr12
    vsadd.hu vr4, vr4, vr13
    vsadd.hu vr4, vr4, vr14

    //dst-2
    vsadd.hu vr5, vr4, vr12
    vsadd.hu vr5, vr5, vr15
    vssub.hu vr5, vr5, vr9
    vssub.hu vr5, vr5, vr10

    //dst-1
    vsadd.hu vr18, vr5, vr13
    vsadd.hu vr18, vr18, vr16
    vssub.hu vr18, vr18, vr9
    vssub.hu vr18, vr18, vr12

    //dst+0
    vsadd.hu vr7, vr18, vr14
    vsadd.hu vr7, vr7, vr17
    vssub.hu vr7, vr7, vr9
    vssub.hu vr7, vr7, vr13

    //dst+1
    vsadd.hu vr8, vr7, vr15
    vsadd.hu vr8, vr8, vr17
    vssub.hu vr8, vr8, vr10
    vssub.hu vr8, vr8, vr14

    //dst+2
    vsadd.hu vr9, vr8, vr16
    vsadd.hu vr9, vr9, vr17
    vssub.hu vr9, vr9, vr12
    vssub.hu vr9, vr9, vr15

    vsrari.h vr4, vr4, 3
    vsrari.h vr5, vr5, 3
    vsrari.h vr18, vr18, 3
    vsrari.h vr7, vr7, 3
    vsrari.h vr8, vr8, 3
    vsrari.h vr9, vr9, 3

    //flat8out & flat8in
    vbitsel.v vr24, vr4, vr24, vr1 //dst-3
    vbitsel.v vr25, vr5, vr25, vr1 //dst-2
    vbitsel.v vr26, vr18, vr26, vr1 //dst-1
    vbitsel.v vr27, vr7, vr27, vr1 //dst+0
    vbitsel.v vr28, vr8, vr28, vr1 //dst+1
    vbitsel.v vr29, vr9, vr29, vr1 //dst+2

    //!flat8in
    vsub.h vr17, vr12, vr15 //p1-q1
    vsllwil.h.b vr2, vr2, 0
    vand.v vr17, vr17, vr2 //&hev
    vssrani.b.h vr17, vr17, 0
    vsllwil.h.b vr17, vr17, 0

    vsub.h vr7, vr14, vr13
    vsadd.h vr8, vr7, vr7
    vsadd.h vr7, vr7, vr8
    vsadd.h vr7, vr7, vr17
    vssrani.b.h vr7, vr7, 0
    vsllwil.h.b vr17, vr7, 0 //f = iclip_diff(3 * (q0 - p0) + f);

    vaddi.hu vr7, vr17, 4
    vaddi.hu vr8, vr17, 3
    // Clamp f+4 / f+3 to int8 max (127) before >>3 (iclip_diff).
    li.w t5, 127
    vreplgr2vr.h vr9, t5
    vmin.h vr7, vr7, vr9
    vmin.h vr8, vr8, vr9
    vsrai.h vr7, vr7, 3 //f1
    vsrai.h vr8, vr8, 3 //f2

    vsadd.h vr4, vr13, vr8 //dst-1
    vssub.h vr5, vr14, vr7 //dst+0

    vsrari.h vr7, vr7, 1
    vsadd.h vr17, vr12, vr7
    vssub.h vr7, vr15, vr7
    // p1/q1 untouched on hev lanes.
    vbitsel.v vr17, vr17, vr12, vr2 //dst-2
    vbitsel.v vr7, vr7, vr15, vr2 //dst+1

    //flat8in or !flat8in
    vsllwil.h.b vr3, vr3, 0
    vbitsel.v vr24, vr10, vr24, vr3 //dst-3
    vbitsel.v vr25, vr17, vr25, vr3 //dst-2
    vbitsel.v vr26, vr4, vr26, vr3 //dst-1
    vbitsel.v vr27, vr5, vr27, vr3 //dst+0
    vbitsel.v vr28, vr7, vr28, vr3 //dst+1
    vbitsel.v vr29, vr16, vr29, vr3 //dst+2

.ifc \DIR, h
    // Narrow each pair of results, then shuffle the 12 output columns
    // back into row order for the 4 rows.
    //dst-6,dst-2,dst-5,dst-1
    vssrani.bu.h vr25, vr21, 0
    vssrani.bu.h vr26, vr22, 0
    vpermi.w vr25, vr25, 0xd8
    vpermi.w vr26, vr26, 0xd8
    vilvl.b vr6, vr26, vr25 //65656565 21212121

    //dst-4,dst+0,dst-3,dst+1
    vssrani.bu.h vr27, vr23, 0
    vssrani.bu.h vr28, vr24, 0
    vpermi.w vr27, vr27, 0xd8
    vpermi.w vr28, vr28, 0xd8
    vilvl.b vr26, vr28, vr27 //43434343 01010101

    vilvl.h vr21, vr26, vr6 //6543 -- -- --
    vilvh.h vr22, vr26, vr6 //2101 -- -- --
    vilvl.w vr20, vr22, vr21 //65432101 --
    vilvh.w vr22, vr22, vr21 //65432101 --
    vreplvei.d vr21, vr20, 1
    vreplvei.d vr23, vr22, 1

    //dst+2,dst+4,dst+3,dst+5
    vssrani.bu.h vr31, vr29, 0
    vssrani.bu.h vr11, vr30, 0
    vpermi.w vr31, vr31, 0xd8
    vpermi.w vr11, vr11, 0xd8
    vilvl.b vr11, vr11, vr31 //23232323 45454545
    vshuf4i.w vr11, vr11, 0xd8
    vshuf4i.h vr11, vr11, 0xd8 //2345 -- -- --

    vextrins.w vr20, vr11, 0x20
    vextrins.w vr21, vr11, 0x21
    vextrins.w vr22, vr11, 0x22
    vextrins.w vr23, vr11, 0x23

    // Reload the original rows so unfiltered lanes can be kept verbatim.
    addi.d t5, a0, -6
    vld vr6, t5, 0 //p6p5p4p3p2p1p0q0 q1q2q3q4q5q6
    vldx vr7, t5, a1
    add.d t5, t5, a1
    vldx vr8, t5, a1
    add.d t5, t5, a1
    vldx vr9, t5, a1

    //expand fm to 128
    vreplvei.b vr10, vr0, 0
    vreplvei.b vr11, vr0, 1
    vreplvei.b vr12, vr0, 2
    vreplvei.b vr13, vr0, 3

    vbitsel.v vr20, vr6, vr20, vr10
    vbitsel.v vr21, vr7, vr21, vr11
    vbitsel.v vr22, vr8, vr22, vr12
    vbitsel.v vr23, vr9, vr23, vr13

    // Store 12 bytes per row (8 + 4).
    addi.d t5, a0, -6
    vstelm.d vr20, t5, 0, 0
    vstelm.w vr20, t5, 8, 2
    add.d t5, t5, a1
    vstelm.d vr21, t5, 0, 0
    vstelm.w vr21, t5, 8, 2
    add.d t5, t5, a1
    vstelm.d vr22, t5, 0, 0
    vstelm.w vr22, t5, 8, 2
    add.d t5, t5, a1
    vstelm.d vr23, t5, 0, 0
    vstelm.w vr23, t5, 8, 2
.else
    //reload
    fld.d f7, sp, 0
    fld.d f8, sp, 8
    fld.d f9, sp, 16
    fld.d f10, sp, 24
    fld.d f12, sp, 32
    fld.d f13, sp, 40
    fld.d f14, sp, 48
    fld.d f15, sp, 56
    fld.d f16, sp, 64
    fld.d f17, sp, 72
    fld.d f18, sp, 80
    fld.d f19, sp, 88

    // Narrow all results to u8 (shift 0: saturate only).
    vssrarni.bu.h vr21, vr21, 0
    vssrarni.bu.h vr22, vr22, 0
    vssrarni.bu.h vr23, vr23, 0
    vssrarni.bu.h vr24, vr24, 0
    vssrarni.bu.h vr25, vr25, 0
    vssrarni.bu.h vr26, vr26, 0
    vssrarni.bu.h vr27, vr27, 0
    vssrarni.bu.h vr28, vr28, 0
    vssrarni.bu.h vr29, vr29, 0
    vssrarni.bu.h vr30, vr30, 0
    vssrarni.bu.h vr31, vr31, 0
    vssrarni.bu.h vr11, vr11, 0

    // Commit only on fm lanes.
    vbitsel.v vr7, vr7, vr21, vr0 //p5
    vbitsel.v vr8, vr8, vr22, vr0 //p4
    vbitsel.v vr9, vr9, vr23, vr0 //p3
    vbitsel.v vr10, vr10, vr24, vr0 //p2
    vbitsel.v vr12, vr12, vr25, vr0 //p1
    vbitsel.v vr13, vr13, vr26, vr0 //p0
    vbitsel.v vr14, vr14, vr27, vr0 //q0
    vbitsel.v vr15, vr15, vr28, vr0 //q1
    vbitsel.v vr16, vr16, vr29, vr0 //q2
    vbitsel.v vr17, vr17, vr30, vr0 //q3
    vbitsel.v vr18, vr18, vr31, vr0 //q4
    vbitsel.v vr19, vr19, vr11, vr0 //q5

    fst.s f14, a0, 0
    fstx.s f15, a0, a1
    alsl.d t5, a1, a0, 1
    fst.s f16, t5, 0
    fstx.s f17, t5, a1
    alsl.d t5, a1, t5, 1
    fst.s f18, t5, 0
    fstx.s f19, t5, a1

    slli.w t5, a1, 2
    alsl.d t5, a1, t5, 1
    sub.d t5, a0, t5
    fst.s f7, t5, 0
    fstx.s f8, t5, a1
    alsl.d t5, a1, t5, 1
    fst.s f9, t5, 0
    fstx.s f10, t5, a1
    alsl.d t5, a1, t5, 1
    fst.s f12, t5, 0
    fstx.s f13, t5, a1
.endif
.END_FILTER_\DIR\()\TYPE\()_W16:
.ifc \DIR, v
    // Release the 96-byte scratch allocated in the v-direction load phase.
    addi.d sp, sp, 96
.endif
.endm

// Save f24-f31 (callee-saved per the LoongArch64 ABI; low 64 bits of
// vr24-vr31) before the filter macros clobber them.
.macro PUSH_REG
    addi.d sp, sp, -64
    fst.d f24, sp, 0
    fst.d f25, sp, 8
    fst.d f26, sp, 16
    fst.d f27, sp, 24
    fst.d f28, sp, 32
    fst.d f29, sp, 40
    fst.d f30, sp, 48
    fst.d f31, sp, 56
.endm
// Restore f24-f31 and release the 64-byte save area.
.macro POP_REG
    fld.d f24, sp, 0
    fld.d f25, sp, 8
    fld.d f26, sp, 16
    fld.d f27, sp, 24
    fld.d f28, sp, 32
    fld.d f29, sp, 40
    fld.d f30, sp, 48
    fld.d f31, sp, 56
    addi.d sp, sp, 64
.endm

// Emits lpf_\DIR\()_sb_\TYPE\()_8bpc_lsx: loop-filter one superblock edge
// row/column, 4 pixels at a time, walking the vmask bitmasks.
// Arguments (assumed to match dav1d's loopfilter_sb prototype — confirm
// against the C caller): a0 = dst, a1 = stride, a2 = vmask (2 or 3 u32
// bitmasks: filter-any / wd>=8 / wd>=16 for y), a3 = level array,
// a4 = b4_stride, a5 = lut (E at lut[L], I at lut[64 + L]).
// Per iteration, bit t3 of the masks selects W4/W6/W8/W16 for that edge.
.macro LPF_FUNC DIR, TYPE
function lpf_\DIR\()_sb_\TYPE\()_8bpc_lsx
    PUSH_REG
    vld vr0, a2, 0 //vmask
    vpickve2gr.wu t0, vr0, 0
    vpickve2gr.wu t1, vr0, 1
    vpickve2gr.wu t2, vr0, 2
    li.w t3, 1 //y
    // t0 = union of all mask bits: any bit set means some edge needs work.
    or t0, t0, t1
.ifc \TYPE, y
    or t0, t0, t2 //vm
.endif
    addi.w t8, t3, -1
    andn t8, t0, t8
    beqz t0, .\DIR\()\TYPE\()_END
.\DIR\()\TYPE\()_LOOP:
    and t4, t0, t3 //vm & y
    beqz t4, .\DIR\()\TYPE\()_LOOP_NEXT
    vldrepl.b vr1, a3, 0 //l[0][0]
.ifc \DIR, h
    addi.d t5, a3, -4
.else
    slli.d t5, a4, 2
    sub.d t5, a3, t5
.endif
    vldrepl.b vr2, t5, 0 //l[-1][0]
    // If l[0][0] == 0 fall back to the neighbouring block's level.
    vseqi.b vr3, vr1, 0
    vbitsel.v vr1, vr1, vr2, vr3 //L
    vpickve2gr.b t5, vr1, 0
    beqz t5, .\DIR\()\TYPE\()_LOOP_NEXT // L == 0: skip this edge
    vsrai.b vr2, vr1, 4 //H
    add.d t6, a5, t5
    vldrepl.b vr3, t6, 0 //E
    addi.d t6, t6, 64
    vldrepl.b vr4, t6, 0 //I
    // Dispatch on the per-edge width bits: 16 > 8/6 > 4.
.ifc \TYPE, y
    and t5, t2, t3
    bnez t5, .FILTER_\DIR\()\TYPE\()_16
.endif
    and t5, t1, t3
.ifc \TYPE, y
    bnez t5, .FILTER_\DIR\()\TYPE\()_8
.else
    bnez t5, .FILTER_\DIR\()\TYPE\()_6
.endif
    FILTER_W4 \DIR, \TYPE
    b .\DIR\()\TYPE\()_LOOP_NEXT
.ifc \TYPE, uv
.FILTER_\DIR\()\TYPE\()_6:
    FILTER_W6 \DIR, \TYPE
.endif
.ifc \TYPE, y
.FILTER_\DIR\()\TYPE\()_8:
    FILTER_W8 \DIR, \TYPE
    b .\DIR\()\TYPE\()_LOOP_NEXT
.FILTER_\DIR\()\TYPE\()_16:
    FILTER_W16 \DIR, \TYPE
.endif
.\DIR\()\TYPE\()_LOOP_NEXT:
    // Advance mask bit, pixel pointer and level pointer by one 4-pixel unit.
    slli.w t3, t3, 1
.ifc \DIR, h
    alsl.d a0, a1, a0, 2
    slli.w t8, a4, 2
    add.d a3, a3, t8
.else
    addi.d a0, a0, 4
    addi.d a3, a3, 4
.endif
    // Loop while any mask bit at or above the current position remains.
    addi.w t8, t3, -1
    andn t8, t0, t8
    bnez t8, .\DIR\()\TYPE\()_LOOP
.\DIR\()\TYPE\()_END:
    POP_REG
endfunc
.endm

LPF_FUNC h, y
LPF_FUNC v, y
LPF_FUNC h, uv
LPF_FUNC v, uv