/*
 * Copyright © 2023, VideoLAN and dav1d authors
 * Copyright © 2023, Loongson Technology Corporation Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/loongarch/loongson_asm.S"

/*
static void warp_affine_8x8_c(pixel *dst, const ptrdiff_t dst_stride,
                              const pixel *src, const ptrdiff_t src_stride,
                              const int16_t *const abcd, int mx, int my
                              HIGHBD_DECL_SUFFIX)
*/
.macro FILTER_WARP_RND_P_LSX in0, in1, in2, in3, out0, out1, out2, out3
    vbsrl.v vr2, \in0, \in1
    vbsrl.v vr20, \in0, \in2
    addi.w t4, \in3, 512
    srai.w t4, t4, 10
    addi.w t4, t4, 64
    slli.w t4, t4, 3
    vldx vr1, t5, t4
    add.w t3, t3, t0 // tmx += abcd[0]

    addi.w t4, t3, 512
    srai.w t4, t4, 10
    addi.w t4, t4, 64
    slli.w t4, t4, 3
    vldx vr29, t5, t4
    add.w t3, t3, t0 // tmx += abcd[0]

    vilvl.d vr2, vr20, vr2
    vilvl.d vr1, vr29, vr1
    vmulwev.h.bu.b vr3, vr2, vr1
    vmulwod.h.bu.b vr20, vr2, vr1
    vilvl.d vr2, vr20, vr3
    vhaddw.w.h vr2, vr2, vr2
    vhaddw.d.w vr2, vr2, vr2
    vhaddw.q.d vr2, vr2, vr2
    vilvh.d vr3, vr20, vr3
    vhaddw.w.h vr3, vr3, vr3
    vhaddw.d.w vr3, vr3, vr3
    vhaddw.q.d vr3, vr3, vr3
    vextrins.w \out0, vr2, \out1
    vextrins.w \out2, vr3, \out3
.endm

.macro FILTER_WARP_CLIP_LSX in0, in1, in2, out0, out1
    add.w \in0, \in0, \in1
    addi.w t6, \in0, 512
    srai.w t6, t6, 10
    addi.w t6, t6, 64
    slli.w t6, t6, 3
    fldx.d f1, t5, t6
    vsllwil.h.b vr1, vr1, 0
    vmulwev.w.h vr3, \in2, vr1
    vmaddwod.w.h vr3, \in2, vr1
    vhaddw.d.w vr3, vr3, vr3
    vhaddw.q.d vr3, vr3, vr3
    vextrins.w \out0, vr3, \out1
.endm

const warp_sh
.rept 2
.byte 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17
.endr
.rept 2
.byte 18, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
.endr
endconst
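/*
 * A minimal C sketch of one horizontal tap sum as computed by
 * FILTER_WARP_RND_P_LSX above, assuming dav1d_mc_warp_filter keeps 8
 * int8_t taps per row with a bias of 64 rows (hence the
 * ((tmx + 512) >> 10) + 64 index, scaled by << 3 into a byte offset for
 * vldx); warp_h_sketch is an illustrative name, not a symbol of this file.
 *
static int16_t warp_h_sketch(const uint8_t *src, const int tmx)
{
    const int8_t *const f = &dav1d_mc_warp_filter[((tmx + 512) >> 10) + 64][0];
    int32_t sum = 0;
    for (int k = 0; k < 8; k++)
        sum += f[k] * src[k];
    return (int16_t)((sum + 4) >> 3); // deferred to the vsrarni.h.w ..., 3 below
}
 */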
.macro warp_lsx t, shift
function warp_affine_8x8\t\()_8bpc_lsx
    addi.d sp, sp, -64
    fst.d f24, sp, 0
    fst.d f25, sp, 8
    fst.d f26, sp, 16
    fst.d f27, sp, 24
    fst.d f28, sp, 32
    fst.d f29, sp, 40
    fst.d f30, sp, 48
    fst.d f31, sp, 56

    la.local t4, warp_sh
    ld.h t0, a4, 0 // abcd[0]
    ld.h t1, a4, 2 // abcd[1]

    alsl.w t2, a3, a3, 1
    addi.w t3, a5, 0
    la.local t5, dav1d_mc_warp_filter
    sub.d a2, a2, t2
    addi.d a2, a2, -3
    vld vr0, a2, 0
    vld vr30, t4, 0
    vld vr31, t4, 32

    FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x00, vr5, 0x00
    FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x00, vr7, 0x00
    FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x00, vr9, 0x00
    FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x00, vr11, 0x00

    add.w a5, a5, t1
    or t3, a5, a5
    add.d a2, a2, a3
    vld vr0, a2, 0
    FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x10, vr5, 0x10
    FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x10, vr7, 0x10
    FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x10, vr9, 0x10
    FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x10, vr11, 0x10

    add.w a5, a5, t1
    or t3, a5, a5
    add.d a2, a2, a3
    vld vr0, a2, 0
    FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x20, vr5, 0x20
    FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x20, vr7, 0x20
    FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x20, vr9, 0x20
    FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x20, vr11, 0x20

    add.w a5, a5, t1
    or t3, a5, a5
    add.d a2, a2, a3
    vld vr0, a2, 0
    FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x30, vr5, 0x30
    FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x30, vr7, 0x30
    FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x30, vr9, 0x30
    FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x30, vr11, 0x30

    add.w a5, t1, a5
    or t3, a5, a5
    add.d a2, a2, a3
    vld vr0, a2, 0
    FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr12, 0x00, vr13, 0x00
    FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr14, 0x00, vr15, 0x00
    FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr16, 0x00, vr17, 0x00
    FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr18, 0x00, vr19, 0x00

    add.w a5, a5, t1
    or t3, a5, a5
    add.d a2, a2, a3
    vld vr0, a2, 0
    FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr12, 0x10, vr13, 0x10
    FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr14, 0x10, vr15, 0x10
    FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr16, 0x10, vr17, 0x10
    FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr18, 0x10, vr19, 0x10

    add.w a5, a5, t1
    or t3, a5, a5
    add.d a2, a2, a3
    vld vr0, a2, 0
    FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr12, 0x20, vr13, 0x20
    FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr14, 0x20, vr15, 0x20
    FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr16, 0x20, vr17, 0x20
    FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr18, 0x20, vr19, 0x20

    add.w a5, a5, t1
    or t3, a5, a5
    add.d a2, a2, a3
    vld vr0, a2, 0
    FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr12, 0x30, vr13, 0x30
    FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr14, 0x30, vr15, 0x30
    FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr16, 0x30, vr17, 0x30
    FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr18, 0x30, vr19, 0x30

    vsrarni.h.w vr12, vr4, 3
    vsrarni.h.w vr13, vr5, 3
    vsrarni.h.w vr14, vr6, 3
    vsrarni.h.w vr15, vr7, 3
    vsrarni.h.w vr16, vr8, 3
    vsrarni.h.w vr17, vr9, 3
    vsrarni.h.w vr18, vr10, 3
    vsrarni.h.w vr19, vr11, 3

    add.w a5, a5, t1
    or t3, a5, a5
    add.d a2, a2, a3
    vld vr0, a2, 0
    FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x00, vr5, 0x00
    FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x00, vr7, 0x00
    FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x00, vr9, 0x00
    FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x00, vr11, 0x00

    add.w a5, a5, t1
    or t3, a5, a5
    add.d a2, a2, a3
    vld vr0, a2, 0
    FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x10, vr5, 0x10
    FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x10, vr7, 0x10
    FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x10, vr9, 0x10
    FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x10, vr11, 0x10

    add.w a5, a5, t1
    or t3, a5, a5
    add.d a2, a2, a3
    vld vr0, a2, 0
    FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x20, vr5, 0x20
    FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x20, vr7, 0x20
    FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x20, vr9, 0x20
    FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x20, vr11, 0x20

    add.w a5, a5, t1
    or t3, a5, a5
    add.d a2, a2, a3
    vld vr0, a2, 0
    FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x30, vr5, 0x30
    FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x30, vr7, 0x30
    FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x30, vr9, 0x30
    FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x30, vr11, 0x30

    add.w a5, a5, t1
    or t3, a5, a5
    add.d a2, a2, a3
    vld vr0, a2, 0
    FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr21, 0x00, vr22, 0x00
    FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr23, 0x00, vr24, 0x00
    FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr25, 0x00, vr26, 0x00
    FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr27, 0x00, vr28, 0x00

    add.w a5, a5, t1
    or t3, a5, a5
    add.d a2, a2, a3
    vld vr0, a2, 0
    FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr21, 0x10, vr22, 0x10
    FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr23, 0x10, vr24, 0x10
    FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr25, 0x10, vr26, 0x10
    FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr27, 0x10, vr28, 0x10

    add.w a5, a5, t1
    or t3, a5, a5
    add.d a2, a2, a3
    vld vr0, a2, 0
    FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr21, 0x20, vr22, 0x20
    FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr23, 0x20, vr24, 0x20
    FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr25, 0x20, vr26, 0x20
    FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr27, 0x20, vr28, 0x20

    vsrarni.h.w vr21, vr4, 3
    vsrarni.h.w vr22, vr5, 3
    vsrarni.h.w vr23, vr6, 3
    vsrarni.h.w vr24, vr7, 3
    vsrarni.h.w vr25, vr8, 3
    vsrarni.h.w vr26, vr9, 3
    vsrarni.h.w vr27, vr10, 3
    vsrarni.h.w vr28, vr11, 3

    addi.w t2, a6, 0 // my
    ld.h t7, a4, 4 // abcd[2]
    ld.h t8, a4, 6 // abcd[3]

.ifnb \t
    slli.d a1, a1, 1
.endif

    FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00
    FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10
    FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20
    FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30
    FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00
    FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10
    FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20
    FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30
.ifnb \t
    vssrarni.h.w vr5, vr4, \shift
    vst vr5, a0, 0
.else
    vssrarni.hu.w vr5, vr4, \shift
    vssrlni.bu.h vr5, vr5, 0
    fst.d f5, a0, 0
.endif

    vshuf.b vr12, vr21, vr12, vr30
    vshuf.b vr13, vr22, vr13, vr30
    vshuf.b vr14, vr23, vr14, vr30
    vshuf.b vr15, vr24, vr15, vr30
    vshuf.b vr16, vr25, vr16, vr30
    vshuf.b vr17, vr26, vr17, vr30
    vshuf.b vr18, vr27, vr18, vr30
    vshuf.b vr19, vr28, vr19, vr30
    vextrins.h vr30, vr31, 0x70

    add.w a6, a6, t8
    addi.w t2, a6, 0
    FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00
    FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10
    FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20
    FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30
    FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00
    FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10
    FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20
    FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30
.ifnb \t
    vssrarni.h.w vr5, vr4, \shift
    vstx vr5, a0, a1
.else
    vssrarni.hu.w vr5, vr4, \shift
    vssrlni.bu.h vr5, vr5, 0
    fstx.d f5, a0, a1
.endif

    vaddi.bu vr31, vr31, 2
    vshuf.b vr12, vr21, vr12, vr30
    vshuf.b vr13, vr22, vr13, vr30
    vshuf.b vr14, vr23, vr14, vr30
    vshuf.b vr15, vr24, vr15, vr30
    vshuf.b vr16, vr25, vr16, vr30
    vshuf.b vr17, vr26, vr17, vr30
    vshuf.b vr18, vr27, vr18, vr30
    vshuf.b vr19, vr28, vr19, vr30
    vextrins.h vr30, vr31, 0x70

    add.w a6, a6, t8
    addi.w t2, a6, 0
    FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00
    FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10
    FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20
    FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30
    FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00
    FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10
    FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20
    FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30
    alsl.d a0, a1, a0, 1
.ifnb \t
    vssrarni.h.w vr5, vr4, \shift
    vst vr5, a0, 0
.else
    vssrarni.hu.w vr5, vr4, \shift
    vssrlni.bu.h vr5, vr5, 0
    fst.d f5, a0, 0
.endif

    vaddi.bu vr31, vr31, 2
    vshuf.b vr12, vr21, vr12, vr30
    vshuf.b vr13, vr22, vr13, vr30
    vshuf.b vr14, vr23, vr14, vr30
    vshuf.b vr15, vr24, vr15, vr30
    vshuf.b vr16, vr25, vr16, vr30
    vshuf.b vr17, vr26, vr17, vr30
    vshuf.b vr18, vr27, vr18, vr30
    vshuf.b vr19, vr28, vr19, vr30
    vextrins.h vr30, vr31, 0x70

    add.w a6, a6, t8
    addi.w t2, a6, 0
    FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00
    FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10
    FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20
    FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30
    FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00
    FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10
    FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20
    FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30
.ifnb \t
    vssrarni.h.w vr5, vr4, \shift
    vstx vr5, a0, a1
.else
    vssrarni.hu.w vr5, vr4, \shift
    vssrlni.bu.h vr5, vr5, 0
    fstx.d f5, a0, a1
.endif

    vaddi.bu vr31, vr31, 2
    vshuf.b vr12, vr21, vr12, vr30
    vshuf.b vr13, vr22, vr13, vr30
    vshuf.b vr14, vr23, vr14, vr30
    vshuf.b vr15, vr24, vr15, vr30
    vshuf.b vr16, vr25, vr16, vr30
    vshuf.b vr17, vr26, vr17, vr30
    vshuf.b vr18, vr27, vr18, vr30
    vshuf.b vr19, vr28, vr19, vr30
    vextrins.h vr30, vr31, 0x70

    add.w a6, a6, t8
    addi.w t2, a6, 0
    FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00
    FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10
    FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20
    FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30
    FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00
    FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10
    FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20
    FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30
    alsl.d a0, a1, a0, 1
.ifnb \t
    vssrarni.h.w vr5, vr4, \shift
    vst vr5, a0, 0
.else
    vssrarni.hu.w vr5, vr4, \shift
    vssrlni.bu.h vr5, vr5, 0
    fst.d f5, a0, 0
.endif

    vaddi.bu vr31, vr31, 2
    vshuf.b vr12, vr21, vr12, vr30
    vshuf.b vr13, vr22, vr13, vr30
    vshuf.b vr14, vr23, vr14, vr30
    vshuf.b vr15, vr24, vr15, vr30
    vshuf.b vr16, vr25, vr16, vr30
    vshuf.b vr17, vr26, vr17, vr30
    vshuf.b vr18, vr27, vr18, vr30
    vshuf.b vr19, vr28, vr19, vr30
    vextrins.h vr30, vr31, 0x70

    add.w a6, a6, t8
    addi.w t2, a6, 0
    FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00
    FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10
    FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20
    FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30
    FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00
    FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10
    FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20
    FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30
.ifnb \t
    vssrarni.h.w vr5, vr4, \shift
    vstx vr5, a0, a1
.else
    vssrarni.hu.w vr5, vr4, \shift
    vssrlni.bu.h vr5, vr5, 0
    fstx.d f5, a0, a1
.endif

    vaddi.bu vr31, vr31, 2
    vshuf.b vr12, vr21, vr12, vr30
    vshuf.b vr13, vr22, vr13, vr30
    vshuf.b vr14, vr23, vr14, vr30
    vshuf.b vr15, vr24, vr15, vr30
    vshuf.b vr16, vr25, vr16, vr30
    vshuf.b vr17, vr26, vr17, vr30
    vshuf.b vr18, vr27, vr18, vr30
    vshuf.b vr19, vr28, vr19, vr30
    vextrins.h vr30, vr31, 0x70

    add.w a6, a6, t8
    addi.w t2, a6, 0
    FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00
    FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10
    FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20
    FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30
    FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00
    FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10
    FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20
    FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30
    alsl.d a0, a1, a0, 1
.ifnb \t
    vssrarni.h.w vr5, vr4, \shift
    vst vr5, a0, 0
.else
    vssrarni.hu.w vr5, vr4, \shift
    vssrlni.bu.h vr5, vr5, 0
    fst.d f5, a0, 0
.endif

    vshuf.b vr12, vr21, vr12, vr30
    vshuf.b vr13, vr22, vr13, vr30
    vshuf.b vr14, vr23, vr14, vr30
    vshuf.b vr15, vr24, vr15, vr30
    vshuf.b vr16, vr25, vr16, vr30
    vshuf.b vr17, vr26, vr17, vr30
    vshuf.b vr18, vr27, vr18, vr30
    vshuf.b vr19, vr28, vr19, vr30

    add.w a6, a6, t8
    addi.w t2, a6, 0
    FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00
    FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10
    FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20
    FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30
    FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00
    FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10
    FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20
    FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30
.ifnb \t
    vssrarni.h.w vr5, vr4, \shift
    vstx vr5, a0, a1
.else
    vssrarni.hu.w vr5, vr4, \shift
    vssrlni.bu.h vr5, vr5, 0
    fstx.d f5, a0, a1
.endif

    fld.d f24, sp, 0
    fld.d f25, sp, 8
    fld.d f26, sp, 16
    fld.d f27, sp, 24
    fld.d f28, sp, 32
    fld.d f29, sp, 40
    fld.d f30, sp, 48
    fld.d f31, sp, 56
    addi.d sp, sp, 64
endfunc
.endm
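// Instantiate the 8 bpc put and prep variants: the vertical pass is
// rounded by 7 + intermediate_bits = 11 down to pixels for put, and only
// by 7 for prep ("t"), which keeps intermediate_bits of extra precision
// in its int16_t output.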
warp_lsx , 11
warp_lsx t, 7

.macro FILTER_WARP_RND_P_LASX in0, in1, in2, out0, out1, out2, out3
    xvshuf.b xr2, \in0, \in0, \in2

    addi.w t4, \in1, 512
    srai.w t4, t4, 10
    addi.w t4, t4, 64
    slli.w t4, t4, 3
    vldx vr3, t5, t4
    add.w t3, t3, t0 // tmx += abcd[0]

    addi.w t4, t3, 512
    srai.w t4, t4, 10
    addi.w t4, t4, 64
    slli.w t4, t4, 3
    vldx vr4, t5, t4
    add.w t3, t3, t0 // tmx += abcd[0]

    addi.w t4, t3, 512
    srai.w t4, t4, 10
    addi.w t4, t4, 64
    slli.w t4, t4, 3
    vldx vr5, t5, t4
    add.w t3, t3, t0 // tmx += abcd[0]

    addi.w t4, t3, 512
    srai.w t4, t4, 10
    addi.w t4, t4, 64
    slli.w t4, t4, 3
    vldx vr6, t5, t4
    add.w t3, t3, t0 // tmx += abcd[0]

    xvinsve0.d xr3, xr5, 1
    xvinsve0.d xr3, xr4, 2
    xvinsve0.d xr3, xr6, 3

    xvmulwev.h.bu.b xr4, xr2, xr3
    xvmulwod.h.bu.b xr5, xr2, xr3
    xvilvl.d xr2, xr5, xr4
    xvilvh.d xr3, xr5, xr4
    xvhaddw.w.h xr2, xr2, xr2
    xvhaddw.w.h xr3, xr3, xr3
    xvhaddw.d.w xr2, xr2, xr2
    xvhaddw.d.w xr3, xr3, xr3
    xvhaddw.q.d xr2, xr2, xr2
    xvhaddw.q.d xr3, xr3, xr3

    xvextrins.w \out0, xr2, \out1
    xvextrins.w \out2, xr3, \out3
.endm

.macro FILTER_WARP_CLIP_LASX in0, in1, in2, out0, out1
    add.w \in0, \in0, \in1
    addi.w t6, \in0, 512
    srai.w t6, t6, 10
    addi.w t6, t6, 64
    slli.w t6, t6, 3
    fldx.d f1, t5, t6

    add.w t2, t2, t7
    addi.w t6, t2, 512
    srai.w t6, t6, 10
    addi.w t6, t6, 64
    slli.w t6, t6, 3
    fldx.d f2, t5, t6

    vilvl.d vr0, vr2, vr1
    vext2xv.h.b xr0, xr0
    xvmulwev.w.h xr3, \in2, xr0
    xvmaddwod.w.h xr3, \in2, xr0
    xvhaddw.d.w xr3, xr3, xr3
    xvhaddw.q.d xr3, xr3, xr3
    xvextrins.w \out0, xr3, \out1
.endm

const shuf0
.byte 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
.byte 1, 2, 3, 4, 5, 6, 7, 8, 3, 4, 5, 6, 7, 8, 9, 10
endconst
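// shuf0's two 128-bit lanes pick the 8-byte source windows at pixel
// offsets {0, 2} and {1, 3} of the broadcast row; xvaddi.bu ..., 4 below
// shifts them to {4, 6} and {5, 7}, so two FILTER_WARP_RND_P_LASX calls
// cover all eight output columns of a row.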
.macro warp_lasx t, shift
function warp_affine_8x8\t\()_8bpc_lasx
    addi.d sp, sp, -16
    ld.h t0, a4, 0 // abcd[0]
    ld.h t1, a4, 2 // abcd[1]
    fst.d f24, sp, 0
    fst.d f25, sp, 8

    alsl.w t2, a3, a3, 1
    addi.w t3, a5, 0
    la.local t4, warp_sh
    la.local t5, dav1d_mc_warp_filter
    sub.d a2, a2, t2
    addi.d a2, a2, -3
    vld vr0, a2, 0
    xvld xr24, t4, 0
    xvld xr25, t4, 32
    la.local t2, shuf0
    xvld xr1, t2, 0
    xvpermi.q xr0, xr0, 0x00
    xvaddi.bu xr9, xr1, 4
    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x00, xr8, 0x00
    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x00, xr11, 0x00

    add.w a5, a5, t1
    or t3, a5, a5
    add.d a2, a2, a3
    vld vr0, a2, 0
    xvpermi.q xr0, xr0, 0x00
    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x10, xr8, 0x10
    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x10, xr11, 0x10

    add.w a5, a5, t1
    or t3, a5, a5
    add.d a2, a2, a3
    vld vr0, a2, 0
    xvpermi.q xr0, xr0, 0x00
    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x20, xr8, 0x20
    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x20, xr11, 0x20

    add.w a5, a5, t1
    or t3, a5, a5
    add.d a2, a2, a3
    vld vr0, a2, 0
    xvpermi.q xr0, xr0, 0x00
    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x30, xr8, 0x30
    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x30, xr11, 0x30

    add.w a5, a5, t1
    or t3, a5, a5
    add.d a2, a2, a3
    vld vr0, a2, 0
    xvpermi.q xr0, xr0, 0x00
    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x00, xr13, 0x00
    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x00, xr15, 0x00

    add.w a5, a5, t1
    or t3, a5, a5
    add.d a2, a2, a3
    vld vr0, a2, 0
    xvpermi.q xr0, xr0, 0x00
    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x10, xr13, 0x10
    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x10, xr15, 0x10

    add.w a5, a5, t1
    or t3, a5, a5
    add.d a2, a2, a3
    vld vr0, a2, 0
    xvpermi.q xr0, xr0, 0x00
    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x20, xr13, 0x20
    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x20, xr15, 0x20

    add.w a5, a5, t1
    or t3, a5, a5
    add.d a2, a2, a3
    vld vr0, a2, 0
    xvpermi.q xr0, xr0, 0x00
    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x30, xr13, 0x30
    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x30, xr15, 0x30

    xvsrarni.h.w xr12, xr7, 3
    xvsrarni.h.w xr13, xr8, 3
    xvsrarni.h.w xr14, xr10, 3
    xvsrarni.h.w xr15, xr11, 3

    add.w a5, a5, t1
    or t3, a5, a5
    add.d a2, a2, a3
    vld vr0, a2, 0
    xvpermi.q xr0, xr0, 0x00
    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x00, xr8, 0x00
    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x00, xr11, 0x00

    add.w a5, a5, t1
    or t3, a5, a5
    add.d a2, a2, a3
    vld vr0, a2, 0
    xvpermi.q xr0, xr0, 0x00
    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x10, xr8, 0x10
    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x10, xr11, 0x10

    add.w a5, a5, t1
    or t3, a5, a5
    add.d a2, a2, a3
    vld vr0, a2, 0
    xvpermi.q xr0, xr0, 0x00
    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x20, xr8, 0x20
    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x20, xr11, 0x20

    add.w a5, a5, t1
    or t3, a5, a5
    add.d a2, a2, a3
    vld vr0, a2, 0
    xvpermi.q xr0, xr0, 0x00
    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x30, xr8, 0x30
    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x30, xr11, 0x30

    add.w a5, a5, t1
    or t3, a5, a5
    add.d a2, a2, a3
    vld vr0, a2, 0
    xvpermi.q xr0, xr0, 0x00
    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr16, 0x00, xr17, 0x00
    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr18, 0x00, xr19, 0x00

    add.w a5, a5, t1
    or t3, a5, a5
    add.d a2, a2, a3
    vld vr0, a2, 0
    xvpermi.q xr0, xr0, 0x00
    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr16, 0x10, xr17, 0x10
    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr18, 0x10, xr19, 0x10

    add.w a5, a5, t1
    or t3, a5, a5
    add.d a2, a2, a3
    vld vr0, a2, 0
    xvpermi.q xr0, xr0, 0x00
    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr16, 0x20, xr17, 0x20
    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr18, 0x20, xr19, 0x20

    xvsrarni.h.w xr16, xr7, 3
    xvsrarni.h.w xr17, xr8, 3
    xvsrarni.h.w xr18, xr10, 3
    xvsrarni.h.w xr19, xr11, 3

    addi.w t2, a6, 0 // my
    ld.h t7, a4, 4 // abcd[2]
    ld.h t8, a4, 6 // abcd[3]

.ifnb \t
    slli.d a1, a1, 1
.endif

    // y = 0
    FILTER_WARP_CLIP_LASX t2, zero, xr12, xr20, 0x00
    FILTER_WARP_CLIP_LASX t2, t7, xr13, xr20, 0x10
    FILTER_WARP_CLIP_LASX t2, t7, xr14, xr20, 0x20
    FILTER_WARP_CLIP_LASX t2, t7, xr15, xr20, 0x30

    xvshuf.b xr12, xr16, xr12, xr24
    xvshuf.b xr13, xr17, xr13, xr24
    xvshuf.b xr14, xr18, xr14, xr24
    xvshuf.b xr15, xr19, xr15, xr24
    xvextrins.h xr24, xr25, 0x70

    add.w a6, a6, t8
    addi.w t2, a6, 0
    FILTER_WARP_CLIP_LASX t2, zero, xr12, xr21, 0x00
    FILTER_WARP_CLIP_LASX t2, t7, xr13, xr21, 0x10
    FILTER_WARP_CLIP_LASX t2, t7, xr14, xr21, 0x20
    FILTER_WARP_CLIP_LASX t2, t7, xr15, xr21, 0x30

.ifnb \t
    xvssrarni.h.w xr21, xr20, \shift
    xvpermi.q xr22, xr21, 0x01
    vilvl.h vr23, vr22, vr21
    vilvh.h vr21, vr22, vr21
    vst vr23, a0, 0
    vstx vr21, a0, a1
.else
    xvssrarni.hu.w xr21, xr20, \shift
    xvssrlni.bu.h xr22, xr21, 0
    xvpermi.q xr23, xr22, 0x01
    vilvl.b vr21, vr23, vr22
    fst.d f21, a0, 0
    add.d a0, a0, a1
    vstelm.d vr21, a0, 0, 1
.endif

    xvaddi.bu xr25, xr25, 2
    xvshuf.b xr12, xr16, xr12, xr24
    xvshuf.b xr13, xr17, xr13, xr24
    xvshuf.b xr14, xr18, xr14, xr24
    xvshuf.b xr15, xr19, xr15, xr24
    xvextrins.h xr24, xr25, 0x70

    add.w a6, a6, t8
    addi.w t2, a6, 0
    FILTER_WARP_CLIP_LASX t2, zero, xr12, xr20, 0x00
    FILTER_WARP_CLIP_LASX t2, t7, xr13, xr20, 0x10
    FILTER_WARP_CLIP_LASX t2, t7, xr14, xr20, 0x20
    FILTER_WARP_CLIP_LASX t2, t7, xr15, xr20, 0x30

    xvaddi.bu xr25, xr25, 2
    xvshuf.b xr12, xr16, xr12, xr24
    xvshuf.b xr13, xr17, xr13, xr24
    xvshuf.b xr14, xr18, xr14, xr24
    xvshuf.b xr15, xr19, xr15, xr24
    xvextrins.h xr24, xr25, 0x70

    add.w a6, a6, t8
    addi.w t2, a6, 0
    FILTER_WARP_CLIP_LASX t2, zero, xr12, xr21, 0x00
    FILTER_WARP_CLIP_LASX t2, t7, xr13, xr21, 0x10
    FILTER_WARP_CLIP_LASX t2, t7, xr14, xr21, 0x20
    FILTER_WARP_CLIP_LASX t2, t7, xr15, xr21, 0x30

.ifnb \t
    xvssrarni.h.w xr21, xr20, \shift
    alsl.d a0, a1, a0, 1
    xvpermi.q xr22, xr21, 0x01
    vilvl.h vr23, vr22, vr21
    vilvh.h vr21, vr22, vr21
    vst vr23, a0, 0
    vstx vr21, a0, a1
.else
    xvssrarni.hu.w xr21, xr20, \shift
    xvssrlni.bu.h xr22, xr21, 0
    xvpermi.q xr23, xr22, 0x01
    vilvl.b vr21, vr23, vr22
    add.d a0, a0, a1
    fst.d f21, a0, 0
    add.d a0, a0, a1
    vstelm.d vr21, a0, 0, 1
.endif

    xvaddi.bu xr25, xr25, 2
    xvshuf.b xr12, xr16, xr12, xr24
    xvshuf.b xr13, xr17, xr13, xr24
    xvshuf.b xr14, xr18, xr14, xr24
    xvshuf.b xr15, xr19, xr15, xr24
    xvextrins.h xr24, xr25, 0x70

    add.w a6, a6, t8
    addi.w t2, a6, 0
    FILTER_WARP_CLIP_LASX t2, zero, xr12, xr20, 0x00
    FILTER_WARP_CLIP_LASX t2, t7, xr13, xr20, 0x10
    FILTER_WARP_CLIP_LASX t2, t7, xr14, xr20, 0x20
    FILTER_WARP_CLIP_LASX t2, t7, xr15, xr20, 0x30

    xvaddi.bu xr25, xr25, 2
    xvshuf.b xr12, xr16, xr12, xr24
    xvshuf.b xr13, xr17, xr13, xr24
    xvshuf.b xr14, xr18, xr14, xr24
    xvshuf.b xr15, xr19, xr15, xr24
    xvextrins.h xr24, xr25, 0x70

    add.w a6, a6, t8
    addi.w t2, a6, 0
    FILTER_WARP_CLIP_LASX t2, zero, xr12, xr21, 0x00
    FILTER_WARP_CLIP_LASX t2, t7, xr13, xr21, 0x10
    FILTER_WARP_CLIP_LASX t2, t7, xr14, xr21, 0x20
    FILTER_WARP_CLIP_LASX t2, t7, xr15, xr21, 0x30

.ifnb \t
    xvssrarni.h.w xr21, xr20, \shift
    alsl.d a0, a1, a0, 1
    xvpermi.q xr22, xr21, 0x01
    vilvl.h vr23, vr22, vr21
    vilvh.h vr21, vr22, vr21
    vst vr23, a0, 0
    vstx vr21, a0, a1
.else
    xvssrarni.hu.w xr21, xr20, \shift
    xvssrlni.bu.h xr22, xr21, 0
    xvpermi.q xr23, xr22, 0x01
    vilvl.b vr21, vr23, vr22
    add.d a0, a0, a1
    fst.d f21, a0, 0
    add.d a0, a0, a1
    vstelm.d vr21, a0, 0, 1
.endif

    xvaddi.bu xr25, xr25, 2
    xvshuf.b xr12, xr16, xr12, xr24
    xvshuf.b xr13, xr17, xr13, xr24
    xvshuf.b xr14, xr18, xr14, xr24
    xvshuf.b xr15, xr19, xr15, xr24
    xvextrins.h xr24, xr25, 0x70

    add.w a6, a6, t8
    addi.w t2, a6, 0
    FILTER_WARP_CLIP_LASX t2, zero, xr12, xr20, 0x00
    FILTER_WARP_CLIP_LASX t2, t7, xr13, xr20, 0x10
    FILTER_WARP_CLIP_LASX t2, t7, xr14, xr20, 0x20
    FILTER_WARP_CLIP_LASX t2, t7, xr15, xr20, 0x30

    xvshuf.b xr12, xr16, xr12, xr24
    xvshuf.b xr13, xr17, xr13, xr24
    xvshuf.b xr14, xr18, xr14, xr24
    xvshuf.b xr15, xr19, xr15, xr24

    add.w a6, a6, t8
    addi.w t2, a6, 0
    FILTER_WARP_CLIP_LASX t2, zero, xr12, xr21, 0x00
    FILTER_WARP_CLIP_LASX t2, t7, xr13, xr21, 0x10
    FILTER_WARP_CLIP_LASX t2, t7, xr14, xr21, 0x20
    FILTER_WARP_CLIP_LASX t2, t7, xr15, xr21, 0x30

.ifnb \t
    xvssrarni.h.w xr21, xr20, \shift
    alsl.d a0, a1, a0, 1
    xvpermi.q xr22, xr21, 0x01
    vilvl.h vr23, vr22, vr21
    vilvh.h vr21, vr22, vr21
    vst vr23, a0, 0
    vstx vr21, a0, a1
.else
    xvssrarni.hu.w xr21, xr20, \shift
    xvssrlni.bu.h xr22, xr21, 0
    xvpermi.q xr23, xr22, 0x01
    vilvl.b vr21, vr23, vr22
    add.d a0, a0, a1
    fst.d f21, a0, 0
    add.d a0, a0, a1
    vstelm.d vr21, a0, 0, 1
.endif
    fld.d f24, sp, 0
    fld.d f25, sp, 8
    addi.d sp, sp, 16
endfunc
.endm

warp_lasx , 11
warp_lasx t, 7
/*
static void w_avg_c(pixel *dst, const ptrdiff_t dst_stride,
                    const int16_t *tmp1, const int16_t *tmp2,
                    const int w, int h,
                    const int weight HIGHBD_DECL_SUFFIX)
*/

#define bpc8_sh  5 // sh = intermediate_bits + 1
#define bpcw8_sh 8 // sh = intermediate_bits + 4

#define bpc_sh  bpc8_sh
#define bpcw_sh bpcw8_sh
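/*
 * A minimal C model of the 8 bpc avg kernels below, and of the clz-based
 * width dispatch shared by all functions in this file; avg_sketch is an
 * illustrative name. w is a power of two in [4, 128], so clz(w) - 24 maps
 * w = 128, 64, ..., 4 to jump-table slots 0..5 (clz(128) == 24 for a
 * 32-bit operand).
 *
static void avg_sketch(uint8_t *dst, const ptrdiff_t dst_stride,
                       const int16_t *tmp1, const int16_t *tmp2,
                       const int w, int h)
{
    do {
        for (int x = 0; x < w; x++) {
            const int v = (tmp1[x] + tmp2[x] + 16) >> 5; // vssrarni.bu.h ..., bpc_sh
            dst[x] = v < 0 ? 0 : v > 255 ? 255 : v;      // .bu saturation
        }
        tmp1 += w;
        tmp2 += w;
        dst += dst_stride;
    } while (--h);
}
 */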
function avg_8bpc_lsx
    addi.d t8, a0, 0

    clz.w t0, a4
    li.w t1, 24
    sub.w t0, t0, t1
    la.local t1, .AVG_LSX_JRTABLE
    alsl.d t0, t0, t1, 1
    ld.h t2, t0, 0 // The jump addresses are relative to .AVG_LSX_JRTABLE
    add.d t1, t1, t2 // Get absolute address
    jirl $r0, t1, 0

    .align 3
.AVG_LSX_JRTABLE:
    .hword .AVG_W128_LSX - .AVG_LSX_JRTABLE
    .hword .AVG_W64_LSX - .AVG_LSX_JRTABLE
    .hword .AVG_W32_LSX - .AVG_LSX_JRTABLE
    .hword .AVG_W16_LSX - .AVG_LSX_JRTABLE
    .hword .AVG_W8_LSX - .AVG_LSX_JRTABLE
    .hword .AVG_W4_LSX - .AVG_LSX_JRTABLE

.AVG_W4_LSX:
    vld vr0, a2, 0
    vld vr1, a3, 0
    vadd.h vr2, vr0, vr1
    vssrarni.bu.h vr3, vr2, bpc_sh
    vstelm.w vr3, a0, 0, 0
    add.d a0, a0, a1
    vstelm.w vr3, a0, 0, 1
    addi.w a5, a5, -2
    addi.d a2, a2, 16
    addi.d a3, a3, 16
    add.d a0, a0, a1
    blt zero, a5, .AVG_W4_LSX
    b .AVG_END_LSX

.AVG_W8_LSX:
    vld vr0, a2, 0
    vld vr2, a2, 16
    vld vr1, a3, 0
    vld vr3, a3, 16
    vadd.h vr4, vr0, vr1
    vadd.h vr5, vr2, vr3
    vssrarni.bu.h vr5, vr4, bpc_sh
    addi.w a5, a5, -2
    addi.d a2, a2, 32
    vstelm.d vr5, a0, 0, 0
    add.d a0, a0, a1
    vstelm.d vr5, a0, 0, 1
    addi.d a3, a3, 32
    add.d a0, a0, a1
    blt zero, a5, .AVG_W8_LSX
    b .AVG_END_LSX

.AVG_W16_LSX:
    vld vr0, a2, 0
    vld vr2, a2, 16
    vld vr1, a3, 0
    vld vr3, a3, 16
    vadd.h vr4, vr0, vr1
    vadd.h vr5, vr2, vr3
    vssrarni.bu.h vr5, vr4, bpc_sh
    addi.w a5, a5, -1
    addi.d a2, a2, 32
    vst vr5, a0, 0
    addi.d a3, a3, 32
    add.d a0, a0, a1
    blt zero, a5, .AVG_W16_LSX
    b .AVG_END_LSX

.AVG_W32_LSX:
    vld vr0, a2, 0
    vld vr2, a2, 16
    vld vr4, a2, 32
    vld vr6, a2, 48
    vld vr1, a3, 0
    vld vr3, a3, 16
    vld vr5, a3, 32
    vld vr7, a3, 48
    vadd.h vr0, vr0, vr1
    vadd.h vr2, vr2, vr3
    vadd.h vr4, vr4, vr5
    vadd.h vr6, vr6, vr7
    vssrarni.bu.h vr2, vr0, bpc_sh
    vssrarni.bu.h vr6, vr4, bpc_sh
    addi.w a5, a5, -1
    addi.d a2, a2, 64
    vst vr2, a0, 0
    vst vr6, a0, 16
    addi.d a3, a3, 64
    add.d a0, a0, a1
    blt zero, a5, .AVG_W32_LSX
    b .AVG_END_LSX

.AVG_W64_LSX:
.rept 4
    vld vr0, a2, 0
    vld vr2, a2, 16
    vld vr1, a3, 0
    vld vr3, a3, 16
    vadd.h vr0, vr0, vr1
    vadd.h vr2, vr2, vr3
    vssrarni.bu.h vr2, vr0, bpc_sh
    addi.d a2, a2, 32
    addi.d a3, a3, 32
    vst vr2, a0, 0
    addi.d a0, a0, 16
.endr
    addi.w a5, a5, -1
    add.d t8, t8, a1
    add.d a0, t8, zero
    blt zero, a5, .AVG_W64_LSX
    b .AVG_END_LSX

.AVG_W128_LSX:
.rept 8
    vld vr0, a2, 0
    vld vr2, a2, 16
    vld vr1, a3, 0
    vld vr3, a3, 16
    vadd.h vr0, vr0, vr1
    vadd.h vr2, vr2, vr3
    vssrarni.bu.h vr2, vr0, bpc_sh
    addi.d a2, a2, 32
    addi.d a3, a3, 32
    vst vr2, a0, 0
    addi.d a0, a0, 16
.endr
    addi.w a5, a5, -1
    add.d t8, t8, a1
    add.d a0, t8, zero
    blt zero, a5, .AVG_W128_LSX
.AVG_END_LSX:
endfunc

function avg_8bpc_lasx
    clz.w t0, a4
    li.w t1, 24
    sub.w t0, t0, t1
    la.local t1, .AVG_LASX_JRTABLE
    alsl.d t0, t0, t1, 1
    ld.h t2, t0, 0
    add.d t1, t1, t2
    jirl $r0, t1, 0

    .align 3
.AVG_LASX_JRTABLE:
    .hword .AVG_W128_LASX - .AVG_LASX_JRTABLE
    .hword .AVG_W64_LASX - .AVG_LASX_JRTABLE
    .hword .AVG_W32_LASX - .AVG_LASX_JRTABLE
    .hword .AVG_W16_LASX - .AVG_LASX_JRTABLE
    .hword .AVG_W8_LASX - .AVG_LASX_JRTABLE
    .hword .AVG_W4_LASX - .AVG_LASX_JRTABLE

.AVG_W4_LASX:
    vld vr0, a2, 0
    vld vr1, a3, 0
    vadd.h vr0, vr0, vr1
    vssrarni.bu.h vr1, vr0, bpc_sh
    vstelm.w vr1, a0, 0, 0
    add.d a0, a0, a1
    vstelm.w vr1, a0, 0, 1
    addi.w a5, a5, -2
    addi.d a2, a2, 16
    addi.d a3, a3, 16
    add.d a0, a0, a1
    blt zero, a5, .AVG_W4_LASX
    b .AVG_END_LASX
.AVG_W8_LASX:
    xvld xr0, a2, 0
    xvld xr1, a3, 0
    xvadd.h xr2, xr0, xr1
    xvssrarni.bu.h xr1, xr2, bpc_sh
    xvstelm.d xr1, a0, 0, 0
    add.d a0, a0, a1
    xvstelm.d xr1, a0, 0, 2
    addi.w a5, a5, -2
    addi.d a2, a2, 32
    addi.d a3, a3, 32
    add.d a0, a1, a0
    blt zero, a5, .AVG_W8_LASX
    b .AVG_END_LASX
.AVG_W16_LASX:
    xvld xr0, a2, 0
    xvld xr2, a2, 32
    xvld xr1, a3, 0
    xvld xr3, a3, 32
    xvadd.h xr4, xr0, xr1
    xvadd.h xr5, xr2, xr3
    xvssrarni.bu.h xr5, xr4, bpc_sh
    xvpermi.d xr2, xr5, 0xd8
    xvpermi.d xr3, xr5, 0x8d
    vst vr2, a0, 0
    vstx vr3, a0, a1
    addi.w a5, a5, -2
    addi.d a2, a2, 64
    addi.d a3, a3, 64
    alsl.d a0, a1, a0, 1
    blt zero, a5, .AVG_W16_LASX
    b .AVG_END_LASX
.AVG_W32_LASX:
    xvld xr0, a2, 0
    xvld xr2, a2, 32
    xvld xr1, a3, 0
    xvld xr3, a3, 32
    xvadd.h xr4, xr0, xr1
    xvadd.h xr5, xr2, xr3
    xvssrarni.bu.h xr5, xr4, bpc_sh
    xvpermi.d xr6, xr5, 0xd8
    xvst xr6, a0, 0
    addi.w a5, a5, -1
    addi.d a2, a2, 64
    addi.d a3, a3, 64
    add.d a0, a0, a1
    blt zero, a5, .AVG_W32_LASX
    b .AVG_END_LASX
.AVG_W64_LASX:
    xvld xr0, a2, 0
    xvld xr2, a2, 32
    xvld xr4, a2, 64
    xvld xr6, a2, 96
    xvld xr1, a3, 0
    xvld xr3, a3, 32
    xvld xr5, a3, 64
    xvld xr7, a3, 96
    xvadd.h xr0, xr0, xr1
    xvadd.h xr2, xr2, xr3
    xvadd.h xr4, xr4, xr5
    xvadd.h xr6, xr6, xr7
    xvssrarni.bu.h xr2, xr0, bpc_sh
    xvssrarni.bu.h xr6, xr4, bpc_sh
    xvpermi.d xr1, xr2, 0xd8
    xvpermi.d xr3, xr6, 0xd8
    xvst xr1, a0, 0
    xvst xr3, a0, 32
    addi.w a5, a5, -1
    addi.d a2, a2, 128
    addi.d a3, a3, 128
    add.d a0, a0, a1
    blt zero, a5, .AVG_W64_LASX
    b .AVG_END_LASX
.AVG_W128_LASX:
    xvld xr0, a2, 0
    xvld xr2, a2, 32
    xvld xr4, a2, 64
    xvld xr6, a2, 96
    xvld xr8, a2, 128
    xvld xr10, a2, 160
    xvld xr12, a2, 192
    xvld xr14, a2, 224
    xvld xr1, a3, 0
    xvld xr3, a3, 32
    xvld xr5, a3, 64
    xvld xr7, a3, 96
    xvld xr9, a3, 128
    xvld xr11, a3, 160
    xvld xr13, a3, 192
    xvld xr15, a3, 224
    xvadd.h xr0, xr0, xr1
    xvadd.h xr2, xr2, xr3
    xvadd.h xr4, xr4, xr5
    xvadd.h xr6, xr6, xr7
    xvadd.h xr8, xr8, xr9
    xvadd.h xr10, xr10, xr11
    xvadd.h xr12, xr12, xr13
    xvadd.h xr14, xr14, xr15
    xvssrarni.bu.h xr2, xr0, bpc_sh
    xvssrarni.bu.h xr6, xr4, bpc_sh
    xvssrarni.bu.h xr10, xr8, bpc_sh
    xvssrarni.bu.h xr14, xr12, bpc_sh
    xvpermi.d xr1, xr2, 0xd8
    xvpermi.d xr3, xr6, 0xd8
    xvpermi.d xr5, xr10, 0xd8
    xvpermi.d xr7, xr14, 0xd8
    xvst xr1, a0, 0
    xvst xr3, a0, 32
    xvst xr5, a0, 64
    xvst xr7, a0, 96
    addi.w a5, a5, -1
    addi.d a2, a2, 256
    addi.d a3, a3, 256
    add.d a0, a0, a1
    blt zero, a5, .AVG_W128_LASX
.AVG_END_LASX:
endfunc
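/*
 * A minimal C model of the 8 bpc w_avg kernels below (bpcw_sh == 8,
 * weights sum to 16); w_avg_sketch is an illustrative name.
 *
static void w_avg_sketch(uint8_t *dst, const ptrdiff_t dst_stride,
                         const int16_t *tmp1, const int16_t *tmp2,
                         const int w, int h, const int weight)
{
    do {
        for (int x = 0; x < w; x++) {
            const int v = (tmp1[x] * weight +
                           tmp2[x] * (16 - weight) + 128) >> 8;
            dst[x] = v < 0 ? 0 : v > 255 ? 255 : v;
        }
        tmp1 += w;
        tmp2 += w;
        dst += dst_stride;
    } while (--h);
}
 */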
function w_avg_8bpc_lsx
    addi.d t8, a0, 0
    li.w t2, 16
    sub.w t2, t2, a6 // 16 - weight
    vreplgr2vr.h vr21, a6
    vreplgr2vr.h vr22, t2

    clz.w t0, a4
    li.w t1, 24
    sub.w t0, t0, t1
    la.local t1, .W_AVG_LSX_JRTABLE
    alsl.d t0, t0, t1, 1
    ld.h t2, t0, 0
    add.d t1, t1, t2
    jirl $r0, t1, 0

    .align 3
.W_AVG_LSX_JRTABLE:
    .hword .W_AVG_W128_LSX - .W_AVG_LSX_JRTABLE
    .hword .W_AVG_W64_LSX - .W_AVG_LSX_JRTABLE
    .hword .W_AVG_W32_LSX - .W_AVG_LSX_JRTABLE
    .hword .W_AVG_W16_LSX - .W_AVG_LSX_JRTABLE
    .hword .W_AVG_W8_LSX - .W_AVG_LSX_JRTABLE
    .hword .W_AVG_W4_LSX - .W_AVG_LSX_JRTABLE

.W_AVG_W4_LSX:
    vld vr0, a2, 0
    vld vr1, a3, 0
    vmulwev.w.h vr2, vr0, vr21
    vmulwod.w.h vr3, vr0, vr21
    vmaddwev.w.h vr2, vr1, vr22
    vmaddwod.w.h vr3, vr1, vr22
    vssrarni.hu.w vr3, vr2, bpcw_sh
    vssrlni.bu.h vr1, vr3, 0
    vpickod.w vr4, vr2, vr1
    vilvl.b vr0, vr4, vr1
    fst.s f0, a0, 0
    add.d a0, a0, a1
    vstelm.w vr0, a0, 0, 1
    addi.w a5, a5, -2
    addi.d a2, a2, 16
    addi.d a3, a3, 16
    add.d a0, a1, a0
    blt zero, a5, .W_AVG_W4_LSX
    b .W_AVG_END_LSX
.W_AVG_W8_LSX:
    vld vr0, a2, 0
    vld vr1, a3, 0
    vmulwev.w.h vr2, vr0, vr21
    vmulwod.w.h vr3, vr0, vr21
    vmaddwev.w.h vr2, vr1, vr22
    vmaddwod.w.h vr3, vr1, vr22
    vssrarni.hu.w vr3, vr2, bpcw_sh
    vssrlni.bu.h vr1, vr3, 0
    vpickod.w vr4, vr2, vr1
    vilvl.b vr0, vr4, vr1
    fst.d f0, a0, 0
    addi.w a5, a5, -1
    addi.d a2, a2, 16
    addi.d a3, a3, 16
    add.d a0, a0, a1
    blt zero, a5, .W_AVG_W8_LSX
    b .W_AVG_END_LSX
.W_AVG_W16_LSX:
    vld vr0, a2, 0
    vld vr2, a2, 16
    vld vr1, a3, 0
    vld vr3, a3, 16
    vmulwev.w.h vr4, vr0, vr21
    vmulwod.w.h vr5, vr0, vr21
    vmulwev.w.h vr6, vr2, vr21
    vmulwod.w.h vr7, vr2, vr21
    vmaddwev.w.h vr4, vr1, vr22
    vmaddwod.w.h vr5, vr1, vr22
    vmaddwev.w.h vr6, vr3, vr22
    vmaddwod.w.h vr7, vr3, vr22
    vssrarni.hu.w vr6, vr4, bpcw_sh
    vssrarni.hu.w vr7, vr5, bpcw_sh
    vssrlrni.bu.h vr7, vr6, 0
    vshuf4i.w vr8, vr7, 0x4E
    vilvl.b vr0, vr8, vr7
    vst vr0, a0, 0
    addi.w a5, a5, -1
    addi.d a2, a2, 32
    addi.d a3, a3, 32
    add.d a0, a0, a1
    blt zero, a5, .W_AVG_W16_LSX
    b .W_AVG_END_LSX

.W_AVG_W32_LSX:
.rept 2
    vld vr0, a2, 0
    vld vr2, a2, 16
    vld vr1, a3, 0
    vld vr3, a3, 16
    vmulwev.w.h vr4, vr0, vr21
    vmulwod.w.h vr5, vr0, vr21
    vmulwev.w.h vr6, vr2, vr21
    vmulwod.w.h vr7, vr2, vr21
    vmaddwev.w.h vr4, vr1, vr22
    vmaddwod.w.h vr5, vr1, vr22
    vmaddwev.w.h vr6, vr3, vr22
    vmaddwod.w.h vr7, vr3, vr22
    vssrarni.hu.w vr6, vr4, bpcw_sh
    vssrarni.hu.w vr7, vr5, bpcw_sh
    vssrlrni.bu.h vr7, vr6, 0
    vshuf4i.w vr8, vr7, 0x4E
    vilvl.b vr0, vr8, vr7
    vst vr0, a0, 0
    addi.d a2, a2, 32
    addi.d a3, a3, 32
    addi.d a0, a0, 16
.endr
    addi.w a5, a5, -1
    add.d t8, t8, a1
    add.d a0, t8, zero
    blt zero, a5, .W_AVG_W32_LSX
    b .W_AVG_END_LSX

.W_AVG_W64_LSX:
.rept 4
    vld vr0, a2, 0
    vld vr2, a2, 16
    vld vr1, a3, 0
    vld vr3, a3, 16
    vmulwev.w.h vr4, vr0, vr21
    vmulwod.w.h vr5, vr0, vr21
    vmulwev.w.h vr6, vr2, vr21
    vmulwod.w.h vr7, vr2, vr21
    vmaddwev.w.h vr4, vr1, vr22
    vmaddwod.w.h vr5, vr1, vr22
    vmaddwev.w.h vr6, vr3, vr22
    vmaddwod.w.h vr7, vr3, vr22
    vssrarni.hu.w vr6, vr4, bpcw_sh
    vssrarni.hu.w vr7, vr5, bpcw_sh
    vssrlrni.bu.h vr7, vr6, 0
    vshuf4i.w vr8, vr7, 0x4E
    vilvl.b vr0, vr8, vr7
    vst vr0, a0, 0
    addi.d a2, a2, 32
    addi.d a3, a3, 32
    addi.d a0, a0, 16
.endr
    addi.w a5, a5, -1
    add.d t8, t8, a1
    add.d a0, t8, zero
    blt zero, a5, .W_AVG_W64_LSX
    b .W_AVG_END_LSX

.W_AVG_W128_LSX:
.rept 8
    vld vr0, a2, 0
    vld vr2, a2, 16
    vld vr1, a3, 0
    vld vr3, a3, 16
    vmulwev.w.h vr4, vr0, vr21
    vmulwod.w.h vr5, vr0, vr21
    vmulwev.w.h vr6, vr2, vr21
    vmulwod.w.h vr7, vr2, vr21
    vmaddwev.w.h vr4, vr1, vr22
    vmaddwod.w.h vr5, vr1, vr22
    vmaddwev.w.h vr6, vr3, vr22
    vmaddwod.w.h vr7, vr3, vr22
    vssrarni.hu.w vr6, vr4, bpcw_sh
    vssrarni.hu.w vr7, vr5, bpcw_sh
    vssrlrni.bu.h vr7, vr6, 0
    vshuf4i.w vr8, vr7, 0x4E
    vilvl.b vr0, vr8, vr7
    vst vr0, a0, 0
    addi.d a2, a2, 32
    addi.d a3, a3, 32
    addi.d a0, a0, 16
.endr
    addi.w a5, a5, -1
    add.d t8, t8, a1
    add.d a0, t8, zero
    blt zero, a5, .W_AVG_W128_LSX
.W_AVG_END_LSX:
endfunc

function w_avg_8bpc_lasx
    addi.d t8, a0, 0
    li.w t2, 16
    sub.w t2, t2, a6 // 16 - weight
    xvreplgr2vr.h xr21, a6
    xvreplgr2vr.h xr22, t2

    clz.w t0, a4
    li.w t1, 24
    sub.w t0, t0, t1
    la.local t1, .W_AVG_LASX_JRTABLE
    alsl.d t0, t0, t1, 1
    ld.h t2, t0, 0
    add.d t1, t1, t2
    jirl $r0, t1, 0

    .align 3
.W_AVG_LASX_JRTABLE:
    .hword .W_AVG_W128_LASX - .W_AVG_LASX_JRTABLE
    .hword .W_AVG_W64_LASX - .W_AVG_LASX_JRTABLE
    .hword .W_AVG_W32_LASX - .W_AVG_LASX_JRTABLE
    .hword .W_AVG_W16_LASX - .W_AVG_LASX_JRTABLE
    .hword .W_AVG_W8_LASX - .W_AVG_LASX_JRTABLE
    .hword .W_AVG_W4_LASX - .W_AVG_LASX_JRTABLE

.W_AVG_W4_LASX:
    vld vr0, a2, 0
    vld vr1, a3, 0
    xvpermi.d xr2, xr0, 0xD8
    xvpermi.d xr3, xr1, 0xD8
    xvilvl.h xr4, xr3, xr2
    xvmulwev.w.h xr0, xr4, xr21
    xvmaddwod.w.h xr0, xr4, xr22
    xvssrarni.hu.w xr1, xr0, bpcw_sh
    xvssrlni.bu.h xr0, xr1, 0
    fst.s f0, a0, 0
    add.d a0, a0, a1
    xvstelm.w xr0, a0, 0, 4
    addi.w a5, a5, -2
    addi.d a2, a2, 16
    addi.d a3, a3, 16
    add.d a0, a1, a0
    blt zero, a5, .W_AVG_W4_LASX
    b .W_AVG_END_LASX

.W_AVG_W8_LASX:
    xvld xr0, a2, 0
    xvld xr1, a3, 0
    xvmulwev.w.h xr2, xr0, xr21
    xvmulwod.w.h xr3, xr0, xr21
    xvmaddwev.w.h xr2, xr1, xr22
    xvmaddwod.w.h xr3, xr1, xr22
    xvssrarni.hu.w xr3, xr2, bpcw_sh
    xvssrlni.bu.h xr1, xr3, 0
    xvpickod.w xr4, xr2, xr1
    xvilvl.b xr0, xr4, xr1
    xvstelm.d xr0, a0, 0, 0
    add.d a0, a0, a1
    xvstelm.d xr0, a0, 0, 2
    addi.w a5, a5, -2
    addi.d a2, a2, 32
    addi.d a3, a3, 32
    add.d a0, a0, a1
    blt zero, a5, .W_AVG_W8_LASX
    b .W_AVG_END_LASX

.W_AVG_W16_LASX:
    xvld xr0, a2, 0
    xvld xr1, a3, 0
    xvmulwev.w.h xr2, xr0, xr21
    xvmulwod.w.h xr3, xr0, xr21
    xvmaddwev.w.h xr2, xr1, xr22
    xvmaddwod.w.h xr3, xr1, xr22
    xvssrarni.hu.w xr3, xr2, bpcw_sh
    xvssrlni.bu.h xr1, xr3, 0
    xvpickod.w xr4, xr2, xr1
    xvilvl.b xr0, xr4, xr1
    xvpermi.d xr1, xr0, 0xD8
    vst vr1, a0, 0
    addi.w a5, a5, -1
    addi.d a2, a2, 32
    addi.d a3, a3, 32
    add.d a0, a0, a1
    blt zero, a5, .W_AVG_W16_LASX
    b .W_AVG_END_LASX

.W_AVG_W32_LASX:
    xvld xr0, a2, 0
    xvld xr2, a2, 32
    xvld xr1, a3, 0
    xvld xr3, a3, 32
    xvmulwev.w.h xr4, xr0, xr21
    xvmulwod.w.h xr5, xr0, xr21
    xvmulwev.w.h xr6, xr2, xr21
    xvmulwod.w.h xr7, xr2, xr21
    xvmaddwev.w.h xr4, xr1, xr22
    xvmaddwod.w.h xr5, xr1, xr22
    xvmaddwev.w.h xr6, xr3, xr22
    xvmaddwod.w.h xr7, xr3, xr22
    xvssrarni.hu.w xr6, xr4, bpcw_sh
    xvssrarni.hu.w xr7, xr5, bpcw_sh
    xvssrlni.bu.h xr7, xr6, 0
    xvshuf4i.w xr8, xr7, 0x4E
    xvilvl.b xr9, xr8, xr7
    xvpermi.d xr0, xr9, 0xD8
    xvst xr0, a0, 0
    addi.w a5, a5, -1
    addi.d a2, a2, 64
    addi.d a3, a3, 64
    add.d a0, a0, a1
    blt zero, a5, .W_AVG_W32_LASX
    b .W_AVG_END_LASX

.W_AVG_W64_LASX:
.rept 2
    xvld xr0, a2, 0
    xvld xr2, a2, 32
    xvld xr1, a3, 0
    xvld xr3, a3, 32
    xvmulwev.w.h xr4, xr0, xr21
    xvmulwod.w.h xr5, xr0, xr21
    xvmulwev.w.h xr6, xr2, xr21
    xvmulwod.w.h xr7, xr2, xr21
    xvmaddwev.w.h xr4, xr1, xr22
    xvmaddwod.w.h xr5, xr1, xr22
    xvmaddwev.w.h xr6, xr3, xr22
    xvmaddwod.w.h xr7, xr3, xr22
    xvssrarni.hu.w xr6, xr4, bpcw_sh
    xvssrarni.hu.w xr7, xr5, bpcw_sh
    xvssrlni.bu.h xr7, xr6, 0
    xvshuf4i.w xr8, xr7, 0x4E
    xvilvl.b xr9, xr8, xr7
    xvpermi.d xr0, xr9, 0xD8
    xvst xr0, a0, 0
    addi.d a2, a2, 64
    addi.d a3, a3, 64
    addi.d a0, a0, 32
.endr
    addi.w a5, a5, -1
    add.d t8, t8, a1
    add.d a0, t8, zero
    blt zero, a5, .W_AVG_W64_LASX
    b .W_AVG_END_LASX

.W_AVG_W128_LASX:
.rept 4
    xvld xr0, a2, 0
    xvld xr2, a2, 32
    xvld xr1, a3, 0
    xvld xr3, a3, 32
    xvmulwev.w.h xr4, xr0, xr21
    xvmulwod.w.h xr5, xr0, xr21
    xvmulwev.w.h xr6, xr2, xr21
    xvmulwod.w.h xr7, xr2, xr21
    xvmaddwev.w.h xr4, xr1, xr22
    xvmaddwod.w.h xr5, xr1, xr22
    xvmaddwev.w.h xr6, xr3, xr22
    xvmaddwod.w.h xr7, xr3, xr22
    xvssrarni.hu.w xr6, xr4, bpcw_sh
    xvssrarni.hu.w xr7, xr5, bpcw_sh
    xvssrlni.bu.h xr7, xr6, 0
    xvshuf4i.w xr8, xr7, 0x4E
    xvilvl.b xr9, xr8, xr7
    xvpermi.d xr0, xr9, 0xD8
    xvst xr0, a0, 0
    addi.d a2, a2, 64
    addi.d a3, a3, 64
    addi.d a0, a0, 32
.endr

    addi.w a5, a5, -1
    add.d t8, t8, a1
    add.d a0, t8, zero
    blt zero, a5, .W_AVG_W128_LASX
.W_AVG_END_LASX:
endfunc

#undef bpc_sh
#undef bpcw_sh

#define mask_sh 10
/*
static void mask_c(pixel *dst, const ptrdiff_t dst_stride,
                   const int16_t *tmp1, const int16_t *tmp2, const int w, int h,
                   const uint8_t *mask HIGHBD_DECL_SUFFIX)
*/
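/*
 * A minimal C model of the 8 bpc mask kernels below (mask_sh == 10, mask
 * entries in 0..64); mask_sketch is an illustrative name.
 *
static void mask_sketch(uint8_t *dst, const ptrdiff_t dst_stride,
                        const int16_t *tmp1, const int16_t *tmp2,
                        const int w, int h, const uint8_t *mask)
{
    do {
        for (int x = 0; x < w; x++) {
            const int m = mask[x];
            const int v = (tmp1[x] * m + tmp2[x] * (64 - m) + 512) >> 10;
            dst[x] = v < 0 ? 0 : v > 255 ? 255 : v;
        }
        tmp1 += w;
        tmp2 += w;
        mask += w;
        dst += dst_stride;
    } while (--h);
}
 */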
function mask_8bpc_lsx
    vldi vr21, 0x440 // 64
    vxor.v vr19, vr19, vr19
    addi.d t8, a0, 0
    clz.w t0, a4
    li.w t1, 24
    sub.w t0, t0, t1
    la.local t1, .MASK_LSX_JRTABLE
    alsl.d t0, t0, t1, 1
    ld.h t2, t0, 0
    add.d t1, t1, t2
    jirl $r0, t1, 0

    .align 3
.MASK_LSX_JRTABLE:
    .hword .MASK_W128_LSX - .MASK_LSX_JRTABLE
    .hword .MASK_W64_LSX - .MASK_LSX_JRTABLE
    .hword .MASK_W32_LSX - .MASK_LSX_JRTABLE
    .hword .MASK_W16_LSX - .MASK_LSX_JRTABLE
    .hword .MASK_W8_LSX - .MASK_LSX_JRTABLE
    .hword .MASK_W4_LSX - .MASK_LSX_JRTABLE

.MASK_W4_LSX:
    vld vr0, a2, 0
    vld vr1, a3, 0
    fld.d f22, a6, 0

    vilvl.b vr2, vr19, vr22
    vsub.h vr3, vr21, vr2

    vmulwev.w.h vr4, vr0, vr2
    vmulwod.w.h vr5, vr0, vr2
    vmaddwev.w.h vr4, vr1, vr3
    vmaddwod.w.h vr5, vr1, vr3
    vssrarni.hu.w vr5, vr4, mask_sh
    vssrlrni.bu.h vr1, vr5, 0
    vpickod.w vr4, vr2, vr1
    vilvl.b vr0, vr4, vr1
    fst.s f0, a0, 0
    add.d a0, a0, a1
    vstelm.w vr0, a0, 0, 1
    addi.d a2, a2, 16
    addi.d a3, a3, 16
    addi.d a6, a6, 8
    add.d a0, a0, a1
    addi.w a5, a5, -2
    blt zero, a5, .MASK_W4_LSX
    b .MASK_END_LSX
.MASK_W8_LSX:
    vld vr0, a2, 0
    vld vr10, a2, 16
    vld vr1, a3, 0
    vld vr11, a3, 16
    vld vr22, a6, 0

    vilvl.b vr2, vr19, vr22
    vilvh.b vr12, vr19, vr22
    vsub.h vr3, vr21, vr2
    vsub.h vr13, vr21, vr12

    vmulwev.w.h vr4, vr0, vr2
    vmulwod.w.h vr5, vr0, vr2
    vmulwev.w.h vr14, vr10, vr12
    vmulwod.w.h vr15, vr10, vr12
    vmaddwev.w.h vr4, vr1, vr3
    vmaddwod.w.h vr5, vr1, vr3
    vmaddwev.w.h vr14, vr11, vr13
    vmaddwod.w.h vr15, vr11, vr13
    vssrarni.hu.w vr14, vr4, mask_sh
    vssrarni.hu.w vr15, vr5, mask_sh
    vssrlrni.bu.h vr15, vr14, 0
    vshuf4i.w vr6, vr15, 0x4E
    vilvl.b vr0, vr6, vr15
    fst.d f0, a0, 0
    add.d a0, a0, a1
    vstelm.d vr0, a0, 0, 1
    addi.d a2, a2, 32
    addi.d a3, a3, 32
    addi.d a6, a6, 16
    add.d a0, a0, a1
    addi.w a5, a5, -2
    blt zero, a5, .MASK_W8_LSX
    b .MASK_END_LSX

.MASK_W16_LSX:
    vld vr0, a2, 0
    vld vr10, a2, 16
    vld vr1, a3, 0
    vld vr11, a3, 16
    vld vr22, a6, 0

    vilvl.b vr2, vr19, vr22
    vilvh.b vr12, vr19, vr22
    vsub.h vr3, vr21, vr2
    vsub.h vr13, vr21, vr12

    vmulwev.w.h vr4, vr0, vr2
    vmulwod.w.h vr5, vr0, vr2
    vmulwev.w.h vr14, vr10, vr12
    vmulwod.w.h vr15, vr10, vr12
    vmaddwev.w.h vr4, vr1, vr3
    vmaddwod.w.h vr5, vr1, vr3
    vmaddwev.w.h vr14, vr11, vr13
    vmaddwod.w.h vr15, vr11, vr13
    vssrarni.hu.w vr14, vr4, mask_sh
    vssrarni.hu.w vr15, vr5, mask_sh
    vssrlrni.bu.h vr15, vr14, 0
    vshuf4i.w vr6, vr15, 0x4E
    vilvl.b vr0, vr6, vr15
    vst vr0, a0, 0
    addi.d a2, a2, 32
    addi.d a3, a3, 32
    addi.d a6, a6, 16
    add.d a0, a0, a1
    addi.w a5, a5, -1
    blt zero, a5, .MASK_W16_LSX
    b .MASK_END_LSX
.MASK_W32_LSX:
.rept 2
    vld vr0, a2, 0
    vld vr10, a2, 16
    vld vr1, a3, 0
    vld vr11, a3, 16
    vld vr22, a6, 0
    vilvl.b vr2, vr19, vr22
    vilvh.b vr12, vr19, vr22
    vsub.h vr3, vr21, vr2
    vsub.h vr13, vr21, vr12
    vmulwev.w.h vr4, vr0, vr2
    vmulwod.w.h vr5, vr0, vr2
    vmulwev.w.h vr14, vr10, vr12
    vmulwod.w.h vr15, vr10, vr12
    vmaddwev.w.h vr4, vr1, vr3
    vmaddwod.w.h vr5, vr1, vr3
    vmaddwev.w.h vr14, vr11, vr13
    vmaddwod.w.h vr15, vr11, vr13
    vssrarni.hu.w vr14, vr4, mask_sh
    vssrarni.hu.w vr15, vr5, mask_sh
    vssrlrni.bu.h vr15, vr14, 0
    vshuf4i.w vr6, vr15, 0x4E
    vilvl.b vr0, vr6, vr15
    vst vr0, a0, 0
    addi.d a2, a2, 32
    addi.d a3, a3, 32
    addi.d a6, a6, 16
    addi.d a0, a0, 16
.endr
    add.d t8, t8, a1
    add.d a0, t8, zero
    addi.w a5, a5, -1
    blt zero, a5, .MASK_W32_LSX
    b .MASK_END_LSX
.MASK_W64_LSX:
.rept 4
    vld vr0, a2, 0
    vld vr10, a2, 16
    vld vr1, a3, 0
    vld vr11, a3, 16
    vld vr22, a6, 0
    vilvl.b vr2, vr19, vr22
    vilvh.b vr12, vr19, vr22
    vsub.h vr3, vr21, vr2
    vsub.h vr13, vr21, vr12
    vmulwev.w.h vr4, vr0, vr2
    vmulwod.w.h vr5, vr0, vr2
    vmulwev.w.h vr14, vr10, vr12
    vmulwod.w.h vr15, vr10, vr12
    vmaddwev.w.h vr4, vr1, vr3
    vmaddwod.w.h vr5, vr1, vr3
    vmaddwev.w.h vr14, vr11, vr13
    vmaddwod.w.h vr15, vr11, vr13
    vssrarni.hu.w vr14, vr4, mask_sh
    vssrarni.hu.w vr15, vr5, mask_sh
    vssrlrni.bu.h vr15, vr14, 0
    vshuf4i.w vr6, vr15, 0x4E
    vilvl.b vr0, vr6, vr15
    vst vr0, a0, 0
    addi.d a2, a2, 32
    addi.d a3, a3, 32
    addi.d a6, a6, 16
    addi.d a0, a0, 16
.endr
    add.d t8, t8, a1
    add.d a0, t8, zero
    addi.w a5, a5, -1
    blt zero, a5, .MASK_W64_LSX
    b .MASK_END_LSX
.MASK_W128_LSX:
.rept 8
    vld vr0, a2, 0
    vld vr10, a2, 16
    vld vr1, a3, 0
    vld vr11, a3, 16
    vld vr22, a6, 0
    vilvl.b vr2, vr19, vr22
    vilvh.b vr12, vr19, vr22
    vsub.h vr3, vr21, vr2
    vsub.h vr13, vr21, vr12
    vmulwev.w.h vr4, vr0, vr2
    vmulwod.w.h vr5, vr0, vr2
    vmulwev.w.h vr14, vr10, vr12
    vmulwod.w.h vr15, vr10, vr12
    vmaddwev.w.h vr4, vr1, vr3
    vmaddwod.w.h vr5, vr1, vr3
    vmaddwev.w.h vr14, vr11, vr13
    vmaddwod.w.h vr15, vr11, vr13
    vssrarni.hu.w vr14, vr4, mask_sh
    vssrarni.hu.w vr15, vr5, mask_sh
    vssrlrni.bu.h vr15, vr14, 0
    vshuf4i.w vr6, vr15, 0x4E
    vilvl.b vr0, vr6, vr15
    vst vr0, a0, 0
    addi.d a2, a2, 32
    addi.d a3, a3, 32
    addi.d a6, a6, 16
    addi.d a0, a0, 16
.endr
    add.d t8, t8, a1
    add.d a0, t8, zero
    addi.w a5, a5, -1
    blt zero, a5, .MASK_W128_LSX
.MASK_END_LSX:
endfunc

function mask_8bpc_lasx
    xvldi xr21, 0x440 // 64
    xvxor.v xr19, xr19, xr19
    addi.d t8, a0, 0
    clz.w t0, a4
    li.w t1, 24
    sub.w t0, t0, t1
    la.local t1, .MASK_LASX_JRTABLE
    alsl.d t0, t0, t1, 1
    ld.h t2, t0, 0
    add.d t1, t1, t2
    jirl $r0, t1, 0

    .align 3
.MASK_LASX_JRTABLE:
    .hword .MASK_W128_LASX - .MASK_LASX_JRTABLE
    .hword .MASK_W64_LASX - .MASK_LASX_JRTABLE
    .hword .MASK_W32_LASX - .MASK_LASX_JRTABLE
    .hword .MASK_W16_LASX - .MASK_LASX_JRTABLE
    .hword .MASK_W8_LASX - .MASK_LASX_JRTABLE
    .hword .MASK_W4_LASX - .MASK_LASX_JRTABLE

.MASK_W4_LASX:
    vld vr0, a2, 0
    vld vr1, a3, 0
    fld.d f22, a6, 0

    vilvl.h vr4, vr1, vr0
    vilvh.h vr14, vr1, vr0
    vilvl.b vr2, vr19, vr22
    vsub.h vr3, vr21, vr2
    xvpermi.q xr14, xr4, 0x20
    vilvl.h vr5, vr3, vr2
    vilvh.h vr15, vr3, vr2
    xvpermi.q xr15, xr5, 0x20
    xvmulwev.w.h xr0, xr14, xr15
    xvmaddwod.w.h xr0, xr14, xr15
    xvssrarni.hu.w xr1, xr0, mask_sh
    xvssrlni.bu.h xr2, xr1, 0
    fst.s f2, a0, 0
    add.d a0, a0, a1
    xvstelm.w xr2, a0, 0, 4

    addi.d a2, a2, 16
    addi.d a3, a3, 16
    addi.d a6, a6, 8
    add.d a0, a0, a1
    addi.w a5, a5, -2
    blt zero, a5, .MASK_W4_LASX
    b .MASK_END_LASX

.MASK_W8_LASX:
    xvld xr0, a2, 0
    xvld xr1, a3, 0
    vld vr22, a6, 0

    vext2xv.hu.bu xr2, xr22
    xvsub.h xr3, xr21, xr2
    xvmulwev.w.h xr4, xr0, xr2
    xvmulwod.w.h xr5, xr0, xr2
    xvmaddwev.w.h xr4, xr1, xr3
    xvmaddwod.w.h xr5, xr1, xr3
    xvssrarni.hu.w xr5, xr4, mask_sh
    xvssrlni.bu.h xr1, xr5, 0
    xvpickod.w xr4, xr2, xr1
    xvilvl.b xr0, xr4, xr1
    fst.d f0, a0, 0
    add.d a0, a0, a1
    xvstelm.d xr0, a0, 0, 2

    addi.d a2, a2, 32
    addi.d a3, a3, 32
    addi.d a6, a6, 16
    add.d a0, a0, a1
    addi.w a5, a5, -2
    blt zero, a5, .MASK_W8_LASX
    b .MASK_END_LASX

.MASK_W16_LASX:
    xvld xr0, a2, 0
    xvld xr1, a3, 0
    vld vr22, a6, 0

    vext2xv.hu.bu xr2, xr22
    xvsub.h xr3, xr21, xr2
    xvmulwev.w.h xr4, xr0, xr2
    xvmulwod.w.h xr5, xr0, xr2
    xvmaddwev.w.h xr4, xr1, xr3
    xvmaddwod.w.h xr5, xr1, xr3
    xvssrarni.hu.w xr5, xr4, mask_sh
    xvssrlni.bu.h xr1, xr5, 0
    xvpickod.w xr4, xr2, xr1
    xvilvl.b xr0, xr4, xr1
    xvpermi.d xr1, xr0, 0xD8
    vst vr1, a0, 0

    addi.d a2, a2, 32
    addi.d a3, a3, 32
    addi.d a6, a6, 16
    add.d a0, a0, a1
    addi.w a5, a5, -1
    blt zero, a5, .MASK_W16_LASX
    b .MASK_END_LASX
.MASK_W32_LASX:
    xvld xr0, a2, 0
    xvld xr10, a2, 32
    xvld xr1, a3, 0
    xvld xr11, a3, 32
    xvld xr22, a6, 0
    vext2xv.hu.bu xr2, xr22
    xvpermi.q xr4, xr22, 0x01
    vext2xv.hu.bu xr12, xr4
    xvsub.h xr3, xr21, xr2
    xvsub.h xr13, xr21, xr12

    xvmulwev.w.h xr4, xr0, xr2
    xvmulwod.w.h xr5, xr0, xr2
    xvmulwev.w.h xr14, xr10, xr12
    xvmulwod.w.h xr15, xr10, xr12
    xvmaddwev.w.h xr4, xr1, xr3
    xvmaddwod.w.h xr5, xr1, xr3
    xvmaddwev.w.h xr14, xr11, xr13
    xvmaddwod.w.h xr15, xr11, xr13
    xvssrarni.hu.w xr14, xr4, mask_sh
    xvssrarni.hu.w xr15, xr5, mask_sh
    xvssrlni.bu.h xr15, xr14, 0
    xvshuf4i.w xr6, xr15, 0x4E
    xvilvl.b xr1, xr6, xr15
    xvpermi.d xr0, xr1, 0xD8
    xvst xr0, a0, 0

    addi.d a2, a2, 64
    addi.d a3, a3, 64
    addi.d a6, a6, 32
    add.d a0, a0, a1
    addi.w a5, a5, -1
    blt zero, a5, .MASK_W32_LASX
    b .MASK_END_LASX

.MASK_W64_LASX:
.rept 2
    xvld xr0, a2, 0
    xvld xr10, a2, 32
    xvld xr1, a3, 0
    xvld xr11, a3, 32
    xvld xr22, a6, 0
    vext2xv.hu.bu xr2, xr22
    xvpermi.q xr4, xr22, 0x01
    vext2xv.hu.bu xr12, xr4
    xvsub.h xr3, xr21, xr2
    xvsub.h xr13, xr21, xr12

    xvmulwev.w.h xr4, xr0, xr2
    xvmulwod.w.h xr5, xr0, xr2
    xvmulwev.w.h xr14, xr10, xr12
    xvmulwod.w.h xr15, xr10, xr12
    xvmaddwev.w.h xr4, xr1, xr3
    xvmaddwod.w.h xr5, xr1, xr3
    xvmaddwev.w.h xr14, xr11, xr13
    xvmaddwod.w.h xr15, xr11, xr13
    xvssrarni.hu.w xr14, xr4, mask_sh
    xvssrarni.hu.w xr15, xr5, mask_sh
    xvssrlni.bu.h xr15, xr14, 0
    xvshuf4i.w xr6, xr15, 0x4E
    xvilvl.b xr1, xr6, xr15
    xvpermi.d xr0, xr1, 0xD8
    xvst xr0, a0, 0
    addi.d a2, a2, 64
    addi.d a3, a3, 64
    addi.d a6, a6, 32
    addi.d a0, a0, 32
.endr
    add.d t8, t8, a1
    add.d a0, t8, zero
    addi.w a5, a5, -1
    blt zero, a5, .MASK_W64_LASX
    b .MASK_END_LASX

.MASK_W128_LASX:
.rept 4
    xvld xr0, a2, 0
    xvld xr10, a2, 32
    xvld xr1, a3, 0
    xvld xr11, a3, 32
    xvld xr22, a6, 0
    vext2xv.hu.bu xr2, xr22
    xvpermi.q xr4, xr22, 0x01
    vext2xv.hu.bu xr12, xr4
    xvsub.h xr3, xr21, xr2
    xvsub.h xr13, xr21, xr12

    xvmulwev.w.h xr4, xr0, xr2
    xvmulwod.w.h xr5, xr0, xr2
    xvmulwev.w.h xr14, xr10, xr12
    xvmulwod.w.h xr15, xr10, xr12
    xvmaddwev.w.h xr4, xr1, xr3
    xvmaddwod.w.h xr5, xr1, xr3
    xvmaddwev.w.h xr14, xr11, xr13
    xvmaddwod.w.h xr15, xr11, xr13
    xvssrarni.hu.w xr14, xr4, mask_sh
    xvssrarni.hu.w xr15, xr5, mask_sh
    xvssrlni.bu.h xr15, xr14, 0
    xvshuf4i.w xr6, xr15, 0x4E
    xvilvl.b xr1, xr6, xr15
    xvpermi.d xr0, xr1, 0xD8
    xvst xr0, a0, 0

    addi.d a2, a2, 64
    addi.d a3, a3, 64
    addi.d a6, a6, 32
    addi.d a0, a0, 32
.endr
    add.d t8, t8, a1
    add.d a0, t8, zero
    addi.w a5, a5, -1
    blt zero, a5, .MASK_W128_LASX
.MASK_END_LASX:
endfunc

/*
static void w_mask_c(pixel *dst, const ptrdiff_t dst_stride,
                     const int16_t *tmp1, const int16_t *tmp2, const int w, int h,
                     uint8_t *mask, const int sign,
                     const int ss_hor, const int ss_ver HIGHBD_DECL_SUFFIX)
*/
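/*
 * A minimal C model of the 8 bpc w_mask_420 kernels below: the per-pixel
 * blend mask is derived from |tmp1 - tmp2| (the +8, >> 8 rounding and the
 * +38 offset match the vaddi.hu/vsrli.h/vadd.h with the 0x426 constant),
 * and the stored mask is its 2x2 subsample with the sign correction;
 * w_mask_420_sketch and mrow are illustrative names.
 *
static void w_mask_420_sketch(uint8_t *dst, const ptrdiff_t dst_stride,
                              const int16_t *tmp1, const int16_t *tmp2,
                              const int w, int h, uint8_t *mask,
                              const int sign)
{
    int mrow[128];
    do { // two input rows per iteration, one subsampled mask row out
        for (int y = 0; y < 2; y++) {
            for (int x = 0; x < w; x++) {
                const int d = tmp1[x] < tmp2[x] ? tmp2[x] - tmp1[x]
                                                : tmp1[x] - tmp2[x];
                int m = 38 + ((d + 8) >> 8);
                if (m > 64) m = 64;
                const int v = (tmp1[x] * m + tmp2[x] * (64 - m) + 512) >> 10;
                dst[x] = v < 0 ? 0 : v > 255 ? 255 : v;
                mrow[x] = y ? mrow[x] + m : m;
            }
            tmp1 += w;
            tmp2 += w;
            dst += dst_stride;
        }
        for (int x = 0; x < w; x += 2)
            *mask++ = (mrow[x] + mrow[x + 1] + 2 - sign) >> 2;
    } while (h -= 2);
}
 */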
function w_mask_420_8bpc_lsx
    addi.d sp, sp, -24
    fst.d f24, sp, 0
    fst.d f25, sp, 8
    fst.d f26, sp, 16
    vldi vr20, 0x440 // 64
    vreplgr2vr.h vr21, a7 // sign
    vldi vr22, 0x426 // 38

    clz.w t0, a4
    li.w t1, 24
    sub.w t0, t0, t1
    la.local t1, .WMASK420_LSX_JRTABLE
    alsl.d t0, t0, t1, 1
    ld.h t8, t0, 0
    add.d t1, t1, t8
    jirl $r0, t1, 0

    .align 3
.WMASK420_LSX_JRTABLE:
    .hword .WMASK420_W128_LSX - .WMASK420_LSX_JRTABLE
    .hword .WMASK420_W64_LSX - .WMASK420_LSX_JRTABLE
    .hword .WMASK420_W32_LSX - .WMASK420_LSX_JRTABLE
    .hword .WMASK420_W16_LSX - .WMASK420_LSX_JRTABLE
    .hword .WMASK420_W8_LSX - .WMASK420_LSX_JRTABLE
    .hword .WMASK420_W4_LSX - .WMASK420_LSX_JRTABLE

.WMASK420_W4_LSX:
    vld vr0, a2, 0
    vld vr1, a2, 16
    vld vr2, a3, 0
    vld vr3, a3, 16
    addi.w a5, a5, -4

    vabsd.h vr4, vr0, vr2
    vabsd.h vr5, vr1, vr3
    vaddi.hu vr4, vr4, 8
    vaddi.hu vr5, vr5, 8
    vsrli.h vr4, vr4, 8
    vsrli.h vr5, vr5, 8
    vadd.h vr4, vr4, vr22
    vadd.h vr5, vr5, vr22
    vmin.hu vr6, vr4, vr20
    vmin.hu vr7, vr5, vr20
    vsub.h vr8, vr20, vr6
    vsub.h vr9, vr20, vr7
    vmulwev.w.h vr4, vr6, vr0
    vmulwod.w.h vr5, vr6, vr0
    vmulwev.w.h vr10, vr7, vr1
    vmulwod.w.h vr11, vr7, vr1
    vmaddwev.w.h vr4, vr8, vr2
    vmaddwod.w.h vr5, vr8, vr2
    vmaddwev.w.h vr10, vr9, vr3
    vmaddwod.w.h vr11, vr9, vr3
    vilvl.w vr0, vr5, vr4
    vilvh.w vr1, vr5, vr4
    vilvl.w vr2, vr11, vr10
    vilvh.w vr3, vr11, vr10
    vssrarni.hu.w vr1, vr0, 10
    vssrarni.hu.w vr3, vr2, 10
    vssrlni.bu.h vr3, vr1, 0
    vstelm.w vr3, a0, 0, 0
    add.d a0, a0, a1
    vstelm.w vr3, a0, 0, 1
    add.d a0, a0, a1
    vstelm.w vr3, a0, 0, 2
    add.d a0, a0, a1
    vstelm.w vr3, a0, 0, 3
    add.d a0, a0, a1
    vpickev.h vr0, vr7, vr6
    vpickod.h vr1, vr7, vr6
    vadd.h vr0, vr0, vr1
    vshuf4i.h vr0, vr0, 0xd8
    vhaddw.w.h vr2, vr0, vr0
    vpickev.h vr2, vr2, vr2
    vsub.h vr2, vr2, vr21
    vaddi.hu vr2, vr2, 2
    vssrani.bu.h vr2, vr2, 2
    vstelm.w vr2, a6, 0, 0

    addi.d a2, a2, 32
    addi.d a3, a3, 32
    addi.d a6, a6, 4
    blt zero, a5, .WMASK420_W4_LSX
    b .END_W420

.WMASK420_W8_LSX:
    vld vr0, a2, 0
    vld vr1, a2, 16
    vld vr2, a3, 0
    vld vr3, a3, 16
    addi.w a5, a5, -2

    vabsd.h vr4, vr0, vr2
    vabsd.h vr5, vr1, vr3
    vaddi.hu vr4, vr4, 8
    vaddi.hu vr5, vr5, 8
    vsrli.h vr4, vr4, 8
    vsrli.h vr5, vr5, 8
    vadd.h vr4, vr4, vr22
    vadd.h vr5, vr5, vr22
    vmin.hu vr6, vr4, vr20
    vmin.hu vr7, vr5, vr20
    vsub.h vr8, vr20, vr6
    vsub.h vr9, vr20, vr7
    vmulwev.w.h vr4, vr6, vr0
    vmulwod.w.h vr5, vr6, vr0
    vmulwev.w.h vr10, vr7, vr1
    vmulwod.w.h vr11, vr7, vr1
    vmaddwev.w.h vr4, vr8, vr2
    vmaddwod.w.h vr5, vr8, vr2
    vmaddwev.w.h vr10, vr9, vr3
    vmaddwod.w.h vr11, vr9, vr3
    vssrarni.hu.w vr10, vr4, 10
    vssrarni.hu.w vr11, vr5, 10
    vssrlni.bu.h vr11, vr10, 0
    vshuf4i.w vr0, vr11, 0x4E
    vilvl.b vr3, vr0, vr11
    vstelm.d vr3, a0, 0, 0
    add.d a0, a0, a1
    vstelm.d vr3, a0, 0, 1
    add.d a0, a0, a1
    vpickev.h vr0, vr7, vr6
    vpickod.h vr1, vr7, vr6
    vadd.h vr0, vr0, vr1
    vilvh.d vr2, vr0, vr0
    vadd.h vr2, vr2, vr0
    vsub.h vr2, vr2, vr21
    vaddi.hu vr2, vr2, 2
    vssrani.bu.h vr2, vr2, 2
    vstelm.w vr2, a6, 0, 0

.WMASK420_W8_LSX:
    vld            vr0, a2, 0
    vld            vr1, a2, 16
    vld            vr2, a3, 0
    vld            vr3, a3, 16
    addi.w         a5, a5, -2

    vabsd.h        vr4, vr0, vr2
    vabsd.h        vr5, vr1, vr3
    vaddi.hu       vr4, vr4, 8
    vaddi.hu       vr5, vr5, 8
    vsrli.h        vr4, vr4, 8
    vsrli.h        vr5, vr5, 8
    vadd.h         vr4, vr4, vr22
    vadd.h         vr5, vr5, vr22
    vmin.hu        vr6, vr4, vr20
    vmin.hu        vr7, vr5, vr20
    vsub.h         vr8, vr20, vr6
    vsub.h         vr9, vr20, vr7
    vmulwev.w.h    vr4, vr6, vr0
    vmulwod.w.h    vr5, vr6, vr0
    vmulwev.w.h    vr10, vr7, vr1
    vmulwod.w.h    vr11, vr7, vr1
    vmaddwev.w.h   vr4, vr8, vr2
    vmaddwod.w.h   vr5, vr8, vr2
    vmaddwev.w.h   vr10, vr9, vr3
    vmaddwod.w.h   vr11, vr9, vr3
    vssrarni.hu.w  vr10, vr4, 10
    vssrarni.hu.w  vr11, vr5, 10
    vssrlni.bu.h   vr11, vr10, 0
    vshuf4i.w      vr0, vr11, 0x4E
    vilvl.b        vr3, vr0, vr11
    vstelm.d       vr3, a0, 0, 0
    add.d          a0, a0, a1
    vstelm.d       vr3, a0, 0, 1
    add.d          a0, a0, a1
    vpickev.h      vr0, vr7, vr6
    vpickod.h      vr1, vr7, vr6
    vadd.h         vr0, vr0, vr1
    vilvh.d        vr2, vr0, vr0
    vadd.h         vr2, vr2, vr0
    vsub.h         vr2, vr2, vr21
    vaddi.hu       vr2, vr2, 2
    vssrani.bu.h   vr2, vr2, 2
    vstelm.w       vr2, a6, 0, 0

    addi.d         a2, a2, 32
    addi.d         a3, a3, 32
    addi.d         a6, a6, 4
    blt            zero, a5, .WMASK420_W8_LSX
    b              .END_W420

.WMASK420_W16_LSX:
    vld            vr0, a2, 0
    vld            vr1, a2, 16
    alsl.d         a2, a4, a2, 1
    vld            vr2, a2, 0
    vld            vr3, a2, 16
    vld            vr4, a3, 0
    vld            vr5, a3, 16
    alsl.d         a3, a4, a3, 1
    vld            vr6, a3, 0
    vld            vr7, a3, 16

    vabsd.h        vr8, vr0, vr4
    vabsd.h        vr9, vr1, vr5
    vabsd.h        vr10, vr2, vr6
    vabsd.h        vr11, vr3, vr7
    vaddi.hu       vr8, vr8, 8
    vaddi.hu       vr9, vr9, 8
    vaddi.hu       vr10, vr10, 8
    vaddi.hu       vr11, vr11, 8
    vsrli.h        vr8, vr8, 8
    vsrli.h        vr9, vr9, 8
    vsrli.h        vr10, vr10, 8
    vsrli.h        vr11, vr11, 8
    vadd.h         vr8, vr8, vr22
    vadd.h         vr9, vr9, vr22
    vadd.h         vr10, vr10, vr22
    vadd.h         vr11, vr11, vr22
    vmin.hu        vr12, vr8, vr20
    vmin.hu        vr13, vr9, vr20
    vmin.hu        vr14, vr10, vr20
    vmin.hu        vr15, vr11, vr20
    vsub.h         vr16, vr20, vr12
    vsub.h         vr17, vr20, vr13
    vsub.h         vr18, vr20, vr14
    vsub.h         vr19, vr20, vr15
    vmulwev.w.h    vr8, vr12, vr0
    vmulwod.w.h    vr9, vr12, vr0
    vmulwev.w.h    vr10, vr13, vr1
    vmulwod.w.h    vr11, vr13, vr1
    vmulwev.w.h    vr23, vr14, vr2
    vmulwod.w.h    vr24, vr14, vr2
    vmulwev.w.h    vr25, vr15, vr3
    vmulwod.w.h    vr26, vr15, vr3
    vmaddwev.w.h   vr8, vr16, vr4
    vmaddwod.w.h   vr9, vr16, vr4
    vmaddwev.w.h   vr10, vr17, vr5
    vmaddwod.w.h   vr11, vr17, vr5
    vmaddwev.w.h   vr23, vr18, vr6
    vmaddwod.w.h   vr24, vr18, vr6
    vmaddwev.w.h   vr25, vr19, vr7
    vmaddwod.w.h   vr26, vr19, vr7
    vssrarni.hu.w  vr10, vr8, 10
    vssrarni.hu.w  vr11, vr9, 10
    vssrarni.hu.w  vr25, vr23, 10
    vssrarni.hu.w  vr26, vr24, 10
    vssrlni.bu.h   vr11, vr10, 0
    vssrlni.bu.h   vr26, vr25, 0
    vshuf4i.w      vr0, vr11, 0x4E
    vshuf4i.w      vr1, vr26, 0x4E
    vilvl.b        vr3, vr0, vr11
    vilvl.b        vr7, vr1, vr26
    vst            vr3, a0, 0
    vstx           vr7, a0, a1
    vpickev.h      vr0, vr13, vr12
    vpickod.h      vr1, vr13, vr12
    vpickev.h      vr2, vr15, vr14
    vpickod.h      vr3, vr15, vr14
    vadd.h         vr4, vr0, vr1
    vadd.h         vr5, vr2, vr3
    vadd.h         vr4, vr4, vr5
    vsub.h         vr4, vr4, vr21
    vssrarni.bu.h  vr4, vr4, 2
    vstelm.d       vr4, a6, 0, 0

    alsl.d         a2, a4, a2, 1
    alsl.d         a3, a4, a3, 1
    alsl.d         a0, a1, a0, 1
    addi.d         a6, a6, 8
    addi.w         a5, a5, -2
    blt            zero, a5, .WMASK420_W16_LSX
    b              .END_W420

.WMASK420_W32_LSX:
.WMASK420_W64_LSX:
.WMASK420_W128_LSX:

.LOOP_W32_420_LSX:
    add.d          t1, a2, zero
    add.d          t2, a3, zero
    add.d          t3, a0, zero
    add.d          t4, a6, zero
    alsl.d         t5, a4, t1, 1
    alsl.d         t6, a4, t2, 1
    or             t7, a4, a4

.W32_420_LSX:
    vld            vr0, t1, 0
    vld            vr1, t1, 16
    vld            vr2, t2, 0
    vld            vr3, t2, 16
    vld            vr4, t5, 0
    vld            vr5, t5, 16
    vld            vr6, t6, 0
    vld            vr7, t6, 16
    addi.d         t1, t1, 32
    addi.d         t2, t2, 32
    addi.d         t5, t5, 32
    addi.d         t6, t6, 32
    addi.w         t7, t7, -16
    vabsd.h        vr8, vr0, vr2
    vabsd.h        vr9, vr1, vr3
    vabsd.h        vr10, vr4, vr6
    vabsd.h        vr11, vr5, vr7
    vaddi.hu       vr8, vr8, 8
    vaddi.hu       vr9, vr9, 8
    vaddi.hu       vr10, vr10, 8
    vaddi.hu       vr11, vr11, 8
    vsrli.h        vr8, vr8, 8
    vsrli.h        vr9, vr9, 8
    vsrli.h        vr10, vr10, 8
    vsrli.h        vr11, vr11, 8
    vadd.h         vr8, vr8, vr22
    vadd.h         vr9, vr9, vr22
    vadd.h         vr10, vr10, vr22
    vadd.h         vr11, vr11, vr22
    vmin.hu        vr12, vr8, vr20
    vmin.hu        vr13, vr9, vr20
    vmin.hu        vr14, vr10, vr20
    vmin.hu        vr15, vr11, vr20
    vsub.h         vr16, vr20, vr12
    vsub.h         vr17, vr20, vr13
    vsub.h         vr18, vr20, vr14
    vsub.h         vr19, vr20, vr15
    vmulwev.w.h    vr8, vr12, vr0
    vmulwod.w.h    vr9, vr12, vr0
    vmulwev.w.h    vr10, vr13, vr1
    vmulwod.w.h    vr11, vr13, vr1
    vmulwev.w.h    vr23, vr14, vr4
    vmulwod.w.h    vr24, vr14, vr4
    vmulwev.w.h    vr25, vr15, vr5
    vmulwod.w.h    vr26, vr15, vr5
    vmaddwev.w.h   vr8, vr16, vr2
    vmaddwod.w.h   vr9, vr16, vr2
    vmaddwev.w.h   vr10, vr17, vr3
    vmaddwod.w.h   vr11, vr17, vr3
    vmaddwev.w.h   vr23, vr18, vr6
    vmaddwod.w.h   vr24, vr18, vr6
    vmaddwev.w.h   vr25, vr19, vr7
    vmaddwod.w.h   vr26, vr19, vr7
    vssrarni.hu.w  vr10, vr8, 10
    vssrarni.hu.w  vr11, vr9, 10
    vssrarni.hu.w  vr25, vr23, 10
    vssrarni.hu.w  vr26, vr24, 10
    vssrlni.bu.h   vr11, vr10, 0
    vssrlni.bu.h   vr26, vr25, 0
    vshuf4i.w      vr8, vr11, 0x4E
    vshuf4i.w      vr9, vr26, 0x4E
    vilvl.b        vr3, vr8, vr11
    vilvl.b        vr7, vr9, vr26
    vst            vr3, t3, 0
    vstx           vr7, a1, t3
    addi.d         t3, t3, 16
    vpickev.h      vr8, vr13, vr12
    vpickod.h      vr9, vr13, vr12
    vpickev.h      vr10, vr15, vr14
    vpickod.h      vr11, vr15, vr14
    vadd.h         vr8, vr8, vr9
    vadd.h         vr10, vr10, vr11
    vadd.h         vr12, vr8, vr10
    vsub.h         vr12, vr12, vr21
    vssrarni.bu.h  vr12, vr12, 2
    vstelm.d       vr12, t4, 0, 0
    addi.d         t4, t4, 8
    bne            t7, zero, .W32_420_LSX

    alsl.d         a2, a4, a2, 2
    alsl.d         a3, a4, a3, 2
    alsl.d         a0, a1, a0, 1
    srai.w         t8, a4, 1
    add.d          a6, a6, t8
    addi.w         a5, a5, -2
    blt            zero, a5, .LOOP_W32_420_LSX

.END_W420:
    fld.d          f24, sp, 0
    fld.d          f25, sp, 8
    fld.d          f26, sp, 16
    addi.d         sp, sp, 24
endfunc
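
/* The LASX variant below implements the same computation on 256-bit
 * vectors; apart from the shuffles around the stores it mirrors the
 * LSX code above. */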
function w_mask_420_8bpc_lasx
    xvldi          xr20, 0x440
    xvreplgr2vr.h  xr21, a7
    xvldi          xr22, 0x426

    clz.w          t0, a4
    li.w           t1, 24
    sub.w          t0, t0, t1
    la.local       t1, .WMASK420_LASX_JRTABLE
    alsl.d         t0, t0, t1, 1
    ld.h           t8, t0, 0
    add.d          t1, t1, t8
    jirl           $r0, t1, 0

    .align 3
.WMASK420_LASX_JRTABLE:
    .hword .WMASK420_W128_LASX - .WMASK420_LASX_JRTABLE
    .hword .WMASK420_W64_LASX  - .WMASK420_LASX_JRTABLE
    .hword .WMASK420_W32_LASX  - .WMASK420_LASX_JRTABLE
    .hword .WMASK420_W16_LASX  - .WMASK420_LASX_JRTABLE
    .hword .WMASK420_W8_LASX   - .WMASK420_LASX_JRTABLE
    .hword .WMASK420_W4_LASX   - .WMASK420_LASX_JRTABLE

.WMASK420_W4_LASX:
    xvld           xr0, a2, 0
    xvld           xr1, a3, 0
    addi.w         a5, a5, -4

    xvabsd.h       xr2, xr0, xr1
    xvaddi.hu      xr2, xr2, 8
    xvsrli.h       xr2, xr2, 8
    xvadd.h        xr2, xr2, xr22
    xvmin.hu       xr3, xr2, xr20
    xvsub.h        xr4, xr20, xr3
    xvmulwev.w.h   xr5, xr3, xr0
    xvmulwod.w.h   xr6, xr3, xr0
    xvmaddwev.w.h  xr5, xr4, xr1
    xvmaddwod.w.h  xr6, xr4, xr1
    xvilvl.w       xr7, xr6, xr5
    xvilvh.w       xr8, xr6, xr5
    xvssrarni.hu.w xr8, xr7, 10
    xvssrlni.bu.h  xr9, xr8, 0
    vstelm.w       vr9, a0, 0, 0
    add.d          a0, a0, a1
    vstelm.w       vr9, a0, 0, 1
    add.d          a0, a0, a1
    xvstelm.w      xr9, a0, 0, 4
    add.d          a0, a0, a1
    xvstelm.w      xr9, a0, 0, 5
    add.d          a0, a0, a1

    xvhaddw.w.h    xr3, xr3, xr3
    xvpermi.d      xr4, xr3, 0xb1
    xvadd.h        xr3, xr3, xr4
    xvpickev.h     xr3, xr3, xr3
    xvsub.h        xr3, xr3, xr21
    xvssrarni.bu.h xr3, xr3, 2
    vstelm.h       vr3, a6, 0, 0
    xvstelm.h      xr3, a6, 2, 8

    addi.d         a2, a2, 32
    addi.d         a3, a3, 32
    addi.d         a6, a6, 4
    blt            zero, a5, .WMASK420_W4_LASX
    b              .END_W420_LASX

.WMASK420_W8_LASX:
    xvld           xr0, a2, 0
    xvld           xr1, a2, 32
    xvld           xr2, a3, 0
    xvld           xr3, a3, 32
    addi.w         a5, a5, -4

    xvabsd.h       xr4, xr0, xr2
    xvabsd.h       xr5, xr1, xr3
    xvaddi.hu      xr4, xr4, 8
    xvaddi.hu      xr5, xr5, 8
    xvsrli.h       xr4, xr4, 8
    xvsrli.h       xr5, xr5, 8
    xvadd.h        xr4, xr4, xr22
    xvadd.h        xr5, xr5, xr22
    xvmin.hu       xr6, xr4, xr20
    xvmin.hu       xr7, xr5, xr20
    xvsub.h        xr8, xr20, xr6
    xvsub.h        xr9, xr20, xr7
    xvmulwev.w.h   xr10, xr6, xr0
    xvmulwod.w.h   xr11, xr6, xr0
    xvmulwev.w.h   xr12, xr7, xr1
    xvmulwod.w.h   xr13, xr7, xr1
    xvmaddwev.w.h  xr10, xr8, xr2
    xvmaddwod.w.h  xr11, xr8, xr2
    xvmaddwev.w.h  xr12, xr9, xr3
    xvmaddwod.w.h  xr13, xr9, xr3
    xvssrarni.hu.w xr12, xr10, 10
    xvssrarni.hu.w xr13, xr11, 10
    xvssrlni.bu.h  xr13, xr12, 0
    xvshuf4i.w     xr1, xr13, 0x4E
    xvilvl.b       xr17, xr1, xr13
    vstelm.d       vr17, a0, 0, 0
    add.d          a0, a0, a1
    xvstelm.d      xr17, a0, 0, 2
    add.d          a0, a0, a1
    xvstelm.d      xr17, a0, 0, 1
    add.d          a0, a0, a1
    xvstelm.d      xr17, a0, 0, 3
    add.d          a0, a0, a1

    xvhaddw.w.h    xr6, xr6, xr6
    xvhaddw.w.h    xr7, xr7, xr7
    xvpickev.h     xr8, xr7, xr6
    xvpermi.q      xr9, xr8, 0x01
    vadd.h         vr8, vr8, vr9
    vsub.h         vr8, vr8, vr21
    vssrarni.bu.h  vr8, vr8, 2
    vstelm.d       vr8, a6, 0, 0
    addi.d         a2, a2, 64
    addi.d         a3, a3, 64
    addi.d         a6, a6, 8
    blt            zero, a5, .WMASK420_W8_LASX
    b              .END_W420_LASX

.WMASK420_W16_LASX:
    xvld           xr0, a2, 0
    xvld           xr1, a2, 32
    xvld           xr2, a3, 0
    xvld           xr3, a3, 32
    addi.w         a5, a5, -2

    xvabsd.h       xr4, xr0, xr2
    xvabsd.h       xr5, xr1, xr3
    xvaddi.hu      xr4, xr4, 8
    xvaddi.hu      xr5, xr5, 8
    xvsrli.h       xr4, xr4, 8
    xvsrli.h       xr5, xr5, 8
    xvadd.h        xr4, xr4, xr22
    xvadd.h        xr5, xr5, xr22
    xvmin.hu       xr4, xr4, xr20
    xvmin.hu       xr5, xr5, xr20
    xvsub.h        xr6, xr20, xr4
    xvsub.h        xr7, xr20, xr5
    xvmulwev.w.h   xr8, xr4, xr0
    xvmulwod.w.h   xr9, xr4, xr0
    xvmulwev.w.h   xr10, xr5, xr1
    xvmulwod.w.h   xr11, xr5, xr1
    xvmaddwev.w.h  xr8, xr6, xr2
    xvmaddwod.w.h  xr9, xr6, xr2
    xvmaddwev.w.h  xr10, xr7, xr3
    xvmaddwod.w.h  xr11, xr7, xr3
    xvssrarni.hu.w xr10, xr8, 10
    xvssrarni.hu.w xr11, xr9, 10
    xvssrlni.bu.h  xr11, xr10, 0
    xvshuf4i.w     xr8, xr11, 0x4E
    xvilvl.b       xr15, xr8, xr11
    xvpermi.d      xr16, xr15, 0xd8
    vst            vr16, a0, 0
    add.d          a0, a0, a1
    xvpermi.q      xr16, xr16, 0x01
    vst            vr16, a0, 0
    add.d          a0, a0, a1

    xvhaddw.w.h    xr4, xr4, xr4
    xvhaddw.w.h    xr5, xr5, xr5
    xvadd.h        xr4, xr5, xr4
    xvpickev.h     xr6, xr4, xr4
    xvpermi.d      xr7, xr6, 0x08
    vsub.h         vr7, vr7, vr21
    vssrarni.bu.h  vr7, vr7, 2
    vstelm.d       vr7, a6, 0, 0

    addi.d         a2, a2, 64
    addi.d         a3, a3, 64
    addi.d         a6, a6, 8
    blt            zero, a5, .WMASK420_W16_LASX
    b              .END_W420_LASX
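
/* Widths >= 32 share a single loop: the outer loop walks row pairs, the
 * inner loop consumes 16 pixels at a time with t7 counting the remaining
 * width. */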
.WMASK420_W32_LASX:
.WMASK420_W64_LASX:
.WMASK420_W128_LASX:

.LOOP_W32_420_LASX:
    add.d          t1, a2, zero
    add.d          t2, a3, zero
    add.d          t3, a0, zero
    add.d          t4, a6, zero
    alsl.d         t5, a4, t1, 1
    alsl.d         t6, a4, t2, 1
    or             t7, a4, a4
.W32_420_LASX:
    xvld           xr0, t1, 0
    xvld           xr1, t2, 0
    xvld           xr2, t5, 0
    xvld           xr3, t6, 0
    addi.d         t1, t1, 32
    addi.d         t2, t2, 32
    addi.d         t5, t5, 32
    addi.d         t6, t6, 32
    addi.w         t7, t7, -16
    xvabsd.h       xr4, xr0, xr1
    xvabsd.h       xr5, xr2, xr3
    xvaddi.hu      xr4, xr4, 8
    xvaddi.hu      xr5, xr5, 8
    xvsrli.h       xr4, xr4, 8
    xvsrli.h       xr5, xr5, 8
    xvadd.h        xr4, xr4, xr22
    xvadd.h        xr5, xr5, xr22
    xvmin.hu       xr6, xr4, xr20
    xvmin.hu       xr7, xr5, xr20
    xvsub.h        xr8, xr20, xr6
    xvsub.h        xr9, xr20, xr7
    xvmulwev.w.h   xr10, xr6, xr0
    xvmulwod.w.h   xr11, xr6, xr0
    xvmulwev.w.h   xr12, xr7, xr2
    xvmulwod.w.h   xr13, xr7, xr2
    xvmaddwev.w.h  xr10, xr8, xr1
    xvmaddwod.w.h  xr11, xr8, xr1
    xvmaddwev.w.h  xr12, xr9, xr3
    xvmaddwod.w.h  xr13, xr9, xr3
    xvssrarni.hu.w xr12, xr10, 10
    xvssrarni.hu.w xr13, xr11, 10
    xvssrlni.bu.h  xr13, xr12, 0
    xvshuf4i.w     xr10, xr13, 0x4E
    xvilvl.b       xr17, xr10, xr13
    xvpermi.d      xr18, xr17, 0x08
    xvpermi.d      xr19, xr17, 0x0d
    vst            vr18, t3, 0
    vstx           vr19, t3, a1
    addi.d         t3, t3, 16

    xvhaddw.w.h    xr6, xr6, xr6
    xvhaddw.w.h    xr7, xr7, xr7
    xvadd.h        xr6, xr7, xr6
    xvpickev.h     xr7, xr6, xr6
    xvpermi.d      xr8, xr7, 0x08
    vsub.h         vr9, vr8, vr21
    vssrarni.bu.h  vr9, vr9, 2
    vstelm.d       vr9, t4, 0, 0
    addi.d         t4, t4, 8
    bne            t7, zero, .W32_420_LASX

    alsl.d         a2, a4, a2, 2
    alsl.d         a3, a4, a3, 2
    alsl.d         a0, a1, a0, 1
    srai.w         t8, a4, 1
    add.d          a6, a6, t8
    addi.w         a5, a5, -2
    blt            zero, a5, .LOOP_W32_420_LASX

.END_W420_LASX:
endfunc

#undef bpc_sh
#undef bpcw_sh
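
/* Helper macros for the 8-tap filters below. The first two fold a
 * widening horizontal-add chain into one pseudo-instruction; e.g.
 * vhaddw.d.h reduces eight 16-bit lanes to two 64-bit partial sums in
 * two steps. */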

.macro vhaddw.d.h in0
    vhaddw.w.h     \in0, \in0, \in0
    vhaddw.d.w     \in0, \in0, \in0
.endm
.macro vhaddw.q.w in0
    vhaddw.d.w     \in0, \in0, \in0
    vhaddw.q.d     \in0, \in0, \in0
.endm
.macro PUT_H_8W in0
    vbsrl.v        vr2, \in0, 1
    vbsrl.v        vr3, \in0, 2
    vbsrl.v        vr4, \in0, 3
    vbsrl.v        vr5, \in0, 4
    vbsrl.v        vr6, \in0, 5
    vbsrl.v        vr7, \in0, 6
    vbsrl.v        vr10, \in0, 7
    vilvl.d        vr2, vr2, \in0
    vilvl.d        vr3, vr4, vr3
    vilvl.d        vr4, vr6, vr5
    vilvl.d        vr5, vr10, vr7
    vdp2.h.bu.b    \in0, vr2, vr8
    vdp2.h.bu.b    vr2, vr3, vr8
    vdp2.h.bu.b    vr3, vr4, vr8
    vdp2.h.bu.b    vr4, vr5, vr8
    vhaddw.d.h     \in0
    vhaddw.d.h     vr2
    vhaddw.d.h     vr3
    vhaddw.d.h     vr4
    vpickev.w      \in0, vr2, \in0
    vpickev.w      vr2, vr4, vr3
    vpickev.h      \in0, vr2, \in0
    vadd.h         \in0, \in0, vr9
.endm
.macro FILTER_8TAP_4W in0
    vbsrl.v        vr10, \in0, 1
    vbsrl.v        vr11, \in0, 2
    vbsrl.v        vr12, \in0, 3
    vilvl.d        vr10, vr10, \in0
    vilvl.d        vr11, vr12, vr11
    vdp2.h.bu.b    vr7, vr10, vr8
    vdp2.h.bu.b    vr10, vr11, vr8
    vhaddw.d.h     vr7
    vhaddw.d.h     vr10
    vpickev.w      \in0, vr10, vr7
.endm
.macro FILTER_8TAP_8W in0
    vbsrl.v        vr10, \in0, 1
    vbsrl.v        vr11, \in0, 2
    vbsrl.v        vr12, \in0, 3
    vbsrl.v        vr13, \in0, 4
    vbsrl.v        vr14, \in0, 5
    vbsrl.v        vr15, \in0, 6
    vbsrl.v        vr16, \in0, 7
    vilvl.d        vr10, vr10, \in0
    vilvl.d        vr11, vr12, vr11
    vilvl.d        vr12, vr14, vr13
    vilvl.d        vr13, vr16, vr15
    vdp2.h.bu.b    vr14, vr10, vr8
    vdp2.h.bu.b    vr15, vr11, vr8
    vdp2.h.bu.b    vr16, vr12, vr8
    vdp2.h.bu.b    vr17, vr13, vr8
    vhaddw.d.h     vr14
    vhaddw.d.h     vr15
    vhaddw.d.h     vr16
    vhaddw.d.h     vr17
    vpickev.w      vr13, vr15, vr14
    vpickev.w      vr14, vr17, vr16
    vpickev.h      \in0, vr14, vr13 // x0 ... x7
    vsrari.h       \in0, \in0, 2
.endm
.macro FILTER_8TAP_8W_CLIP_STORE
    vdp2.w.h       vr12, vr0, vr9
    vdp2.w.h       vr13, vr1, vr9
    vdp2.w.h       vr14, vr2, vr9
    vdp2.w.h       vr15, vr3, vr9
    vdp2.w.h       vr16, vr4, vr9
    vdp2.w.h       vr17, vr5, vr9
    vdp2.w.h       vr18, vr6, vr9
    vdp2.w.h       vr19, vr7, vr9
    vhaddw.q.w     vr12
    vhaddw.q.w     vr13
    vhaddw.q.w     vr14
    vhaddw.q.w     vr15
    vhaddw.q.w     vr16
    vhaddw.q.w     vr17
    vhaddw.q.w     vr18
    vhaddw.q.w     vr19
    vpackev.w      vr12, vr13, vr12
    vpackev.w      vr13, vr15, vr14
    vpackev.d      vr12, vr13, vr12
    vpackev.w      vr14, vr17, vr16
    vpackev.w      vr15, vr19, vr18
    vpackev.d      vr13, vr15, vr14
    vssrarni.hu.w  vr13, vr12, 10
    vssrani.bu.h   vr13, vr13, 0
    vstelm.d       vr13, a0, 0, 0
    add.d          a0, a0, a1
.endm
.macro VEXTRINS_Hx8 in0
    vextrins.h     vr0, \in0, 0x70
    vextrins.h     vr1, \in0, 0x71
    vextrins.h     vr2, \in0, 0x72
    vextrins.h     vr3, \in0, 0x73
    vextrins.h     vr4, \in0, 0x74
    vextrins.h     vr5, \in0, 0x75
    vextrins.h     vr6, \in0, 0x76
    vextrins.h     vr7, \in0, 0x77
.endm
.macro VBSRL_Vx8
    vbsrl.v        vr0, vr0, 2
    vbsrl.v        vr1, vr1, 2
    vbsrl.v        vr2, vr2, 2
    vbsrl.v        vr3, vr3, 2
    vbsrl.v        vr4, vr4, 2
    vbsrl.v        vr5, vr5, 2
    vbsrl.v        vr6, vr6, 2
    vbsrl.v        vr7, vr7, 2
.endm

.macro PUT_8TAP_8BPC_LSX lable
    li.w           t0, 4
    la.local       t6, dav1d_mc_subpel_filters
    slli.d         t2, a3, 1       // src_stride*2
    add.d          t3, t2, a3      // src_stride*3
    slli.d         t4, t2, 1       // src_stride*4

    bnez           a6, .l_\lable\()put_h // mx
    bnez           a7, .l_\lable\()put_v // my

    clz.w          t1, a4
    li.w           t5, 24
    sub.w          t1, t1, t5
    la.local       t5, .l_\lable\()put_hv0_jtable
    alsl.d         t1, t1, t5, 3
    ld.d           t6, t1, 0
    add.d          t5, t5, t6
    jirl           $r0, t5, 0

    .align 3
.l_\lable\()put_hv0_jtable:
    .dword .l_\lable\()put_hv0_128w - .l_\lable\()put_hv0_jtable
    .dword .l_\lable\()put_hv0_64w  - .l_\lable\()put_hv0_jtable
    .dword .l_\lable\()put_hv0_32w  - .l_\lable\()put_hv0_jtable
    .dword .l_\lable\()put_hv0_16w  - .l_\lable\()put_hv0_jtable
    .dword .l_\lable\()put_hv0_8w   - .l_\lable\()put_hv0_jtable
    .dword .l_\lable\()put_hv0_4w   - .l_\lable\()put_hv0_jtable
    .dword .l_\lable\()put_hv0_2w   - .l_\lable\()put_hv0_jtable
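
/* mx == 0 && my == 0: no filtering at all, the block is a straight copy.
 * Each width class below copies two rows per iteration. */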

.l_\lable\()put_hv0_2w:
    vldrepl.h      vr0, a2, 0
    add.d          a2, a2, a3
    vldrepl.h      vr1, a2, 0
    vstelm.h       vr0, a0, 0, 0
    add.d          a0, a0, a1
    vstelm.h       vr1, a0, 0, 0
    add.d          a2, a2, a3
    add.d          a0, a0, a1
    addi.w         a5, a5, -2
    bnez           a5, .l_\lable\()put_hv0_2w
    b              .l_\lable\()end_put_8tap
.l_\lable\()put_hv0_4w:
    fld.s          f0, a2, 0
    fldx.s         f1, a2, a3
    fst.s          f0, a0, 0
    fstx.s         f1, a0, a1
    alsl.d         a2, a3, a2, 1
    alsl.d         a0, a1, a0, 1
    addi.w         a5, a5, -2
    bnez           a5, .l_\lable\()put_hv0_4w
    b              .l_\lable\()end_put_8tap
.l_\lable\()put_hv0_8w:
    fld.d          f0, a2, 0
    fldx.d         f1, a2, a3
    fst.d          f0, a0, 0
    fstx.d         f1, a0, a1
    alsl.d         a2, a3, a2, 1
    alsl.d         a0, a1, a0, 1
    addi.w         a5, a5, -2
    bnez           a5, .l_\lable\()put_hv0_8w
    b              .l_\lable\()end_put_8tap
.l_\lable\()put_hv0_16w:
    vld            vr0, a2, 0
    vldx           vr1, a2, a3
    vst            vr0, a0, 0
    vstx           vr1, a0, a1
    alsl.d         a2, a3, a2, 1
    alsl.d         a0, a1, a0, 1
    addi.w         a5, a5, -2
    bnez           a5, .l_\lable\()put_hv0_16w
    b              .l_\lable\()end_put_8tap
.l_\lable\()put_hv0_32w:
    vld            vr0, a2, 0
    vld            vr1, a2, 16
    add.d          a2, a2, a3
    vld            vr2, a2, 0
    vld            vr3, a2, 16
    vst            vr0, a0, 0
    vst            vr1, a0, 16
    add.d          a0, a0, a1
    vst            vr2, a0, 0
    vst            vr3, a0, 16
    add.d          a2, a2, a3
    add.d          a0, a0, a1
    addi.w         a5, a5, -2
    bnez           a5, .l_\lable\()put_hv0_32w
    b              .l_\lable\()end_put_8tap
.l_\lable\()put_hv0_64w:
    vld            vr0, a2, 0
    vld            vr1, a2, 16
    vld            vr2, a2, 32
    vld            vr3, a2, 48
    add.d          a2, a2, a3
    vld            vr4, a2, 0
    vld            vr5, a2, 16
    vld            vr6, a2, 32
    vld            vr7, a2, 48
    add.d          a2, a2, a3
    vst            vr0, a0, 0
    vst            vr1, a0, 16
    vst            vr2, a0, 32
    vst            vr3, a0, 48
    add.d          a0, a0, a1
    vst            vr4, a0, 0
    vst            vr5, a0, 16
    vst            vr6, a0, 32
    vst            vr7, a0, 48
    add.d          a0, a0, a1
    addi.w         a5, a5, -2
    bnez           a5, .l_\lable\()put_hv0_64w
    b              .l_\lable\()end_put_8tap
.l_\lable\()put_hv0_128w:
    vld            vr0, a2, 0
    vld            vr1, a2, 16
    vld            vr2, a2, 32
    vld            vr3, a2, 48
    vld            vr4, a2, 64
    vld            vr5, a2, 80
    vld            vr6, a2, 96
    vld            vr7, a2, 112
    add.d          a2, a2, a3
    vld            vr8, a2, 0
    vld            vr9, a2, 16
    vld            vr10, a2, 32
    vld            vr11, a2, 48
    vld            vr12, a2, 64
    vld            vr13, a2, 80
    vld            vr14, a2, 96
    vld            vr15, a2, 112
    add.d          a2, a2, a3
    vst            vr0, a0, 0
    vst            vr1, a0, 16
    vst            vr2, a0, 32
    vst            vr3, a0, 48
    vst            vr4, a0, 64
    vst            vr5, a0, 80
    vst            vr6, a0, 96
    vst            vr7, a0, 112
    add.d          a0, a0, a1
    vst            vr8, a0, 0
    vst            vr9, a0, 16
    vst            vr10, a0, 32
    vst            vr11, a0, 48
    vst            vr12, a0, 64
    vst            vr13, a0, 80
    vst            vr14, a0, 96
    vst            vr15, a0, 112
    add.d          a0, a0, a1
    addi.w         a5, a5, -2
    bnez           a5, .l_\lable\()put_hv0_128w
    b              .l_\lable\()end_put_8tap

.l_\lable\()put_h:
    bnez           a7, .l_\lable\()put_hv // if (fh && fv)
    ld.d           t5, sp, 0 // filter_type
    andi           t1, t5, 3
    blt            t0, a4, .l_\lable\()put_h_idx_fh
    andi           t1, t5, 1
    addi.w         t1, t1, 3

.l_\lable\()put_h_idx_fh:
    addi.w         t5, zero, 120
    mul.w          t1, t1, t5
    addi.w         t5, a6, -1
    slli.w         t5, t5, 3
    add.w          t1, t1, t5
    add.d          t1, t6, t1 // fh's offset
    vldrepl.d      vr8, t1, 0
    addi.d         a2, a2, -3
    li.w           t1, 34
    vreplgr2vr.h   vr9, t1

    clz.w          t1, a4
    li.w           t5, 24
    sub.w          t1, t1, t5
    la.local       t5, .l_\lable\()put_h_jtable
    alsl.d         t1, t1, t5, 3
    ld.d           t6, t1, 0
    add.d          t5, t5, t6
    jirl           $r0, t5, 0

    .align 3
.l_\lable\()put_h_jtable:
    .dword .l_\lable\()put_h_128w - .l_\lable\()put_h_jtable
    .dword .l_\lable\()put_h_64w  - .l_\lable\()put_h_jtable
    .dword .l_\lable\()put_h_32w  - .l_\lable\()put_h_jtable
    .dword .l_\lable\()put_h_16w  - .l_\lable\()put_h_jtable
    .dword .l_\lable\()put_h_8w   - .l_\lable\()put_h_jtable
    .dword .l_\lable\()put_h_4w   - .l_\lable\()put_h_jtable
    .dword .l_\lable\()put_h_2w   - .l_\lable\()put_h_jtable
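
/* The horizontal-only path. Filter selection above mirrors get_filter()
 * in the C code: the low two bits of filter_type pick the horizontal
 * filter set, and w <= 4 falls back to the 4-tap variants in table rows
 * 3/4; each set in dav1d_mc_subpel_filters is 15 rows (indexed mx - 1)
 * of 8 taps, hence the 120-byte set stride. A C sketch of what every
 * width class below computes (8 bpc, so the rounding constant kept in
 * vr9 is 34 and the shift is 6):
 *
 *   const int8_t *const fh = w > 4
 *       ? dav1d_mc_subpel_filters[type & 3][mx - 1]
 *       : dav1d_mc_subpel_filters[3 + (type & 1)][mx - 1];
 *   for (int x = 0; x < w; x++) {
 *       int sum = 0;
 *       for (int i = 0; i < 8; i++)
 *           sum += fh[i] * src[x + i - 3];
 *       dst[x] = iclip_pixel((sum + 34) >> 6);
 *   }
 */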

.l_\lable\()put_h_2w:
    vld            vr0, a2, 0
    vldx           vr1, a2, a3
    add.d          a2, a2, t2

    vbsrl.v        vr2, vr0, 1
    vilvl.d        vr0, vr2, vr0
    vdp2.h.bu.b    vr2, vr0, vr8
    vhaddw.w.h     vr0, vr2, vr2
    vhaddw.d.w     vr0, vr0, vr0
    vbsrl.v        vr2, vr1, 1
    vilvl.d        vr1, vr2, vr1
    vdp2.h.bu.b    vr2, vr1, vr8
    vhaddw.w.h     vr1, vr2, vr2
    vhaddw.d.w     vr1, vr1, vr1
    vpickev.w      vr0, vr1, vr0
    vpickev.h      vr0, vr0, vr0
    vadd.h         vr0, vr0, vr9
    vssrani.bu.h   vr0, vr0, 6

    vstelm.h       vr0, a0, 0, 0
    add.d          a0, a0, a1
    vstelm.h       vr0, a0, 0, 1
    add.d          a0, a0, a1
    addi.w         a5, a5, -2
    bnez           a5, .l_\lable\()put_h_2w
    b              .l_\lable\()end_put_8tap

.l_\lable\()put_h_4w:
    vld            vr0, a2, 0
    vldx           vr1, a2, a3
    add.d          a2, a2, t2

    vbsrl.v        vr2, vr0, 1
    vbsrl.v        vr3, vr0, 2
    vbsrl.v        vr4, vr0, 3
    vilvl.d        vr0, vr2, vr0 // x0 x1
    vilvl.d        vr2, vr4, vr3 // x2 x3
    vdp2.h.bu.b    vr3, vr0, vr8
    vdp2.h.bu.b    vr4, vr2, vr8
    vhaddw.w.h     vr0, vr3, vr3
    vhaddw.d.w     vr0, vr0, vr0
    vhaddw.w.h     vr2, vr4, vr4
    vhaddw.d.w     vr2, vr2, vr2
    vpickev.w      vr5, vr2, vr0
    vbsrl.v        vr2, vr1, 1
    vbsrl.v        vr3, vr1, 2
    vbsrl.v        vr4, vr1, 3
    vilvl.d        vr0, vr2, vr1 // x0 x1
    vilvl.d        vr2, vr4, vr3 // x2 x3
    vdp2.h.bu.b    vr3, vr0, vr8
    vdp2.h.bu.b    vr4, vr2, vr8
    vhaddw.w.h     vr0, vr3, vr3
    vhaddw.d.w     vr0, vr0, vr0
    vhaddw.w.h     vr2, vr4, vr4
    vhaddw.d.w     vr2, vr2, vr2
    vpickev.w      vr6, vr2, vr0
    vpickev.h      vr0, vr6, vr5
    vadd.h         vr0, vr0, vr9
    vssrani.bu.h   vr0, vr0, 6

    vstelm.w       vr0, a0, 0, 0
    add.d          a0, a0, a1
    vstelm.w       vr0, a0, 0, 1
    add.d          a0, a0, a1
    addi.d         a5, a5, -2
    bnez           a5, .l_\lable\()put_h_4w
    b              .l_\lable\()end_put_8tap

.l_\lable\()put_h_8w:
    vld            vr0, a2, 0
    vldx           vr1, a2, a3
    add.d          a2, a2, t2
    PUT_H_8W       vr0
    PUT_H_8W       vr1
    vssrani.bu.h   vr1, vr0, 6
    vstelm.d       vr1, a0, 0, 0
    add.d          a0, a0, a1
    vstelm.d       vr1, a0, 0, 1
    add.d          a0, a0, a1
    addi.w         a5, a5, -2
    bnez           a5, .l_\lable\()put_h_8w
    b              .l_\lable\()end_put_8tap

.l_\lable\()put_h_16w:
.l_\lable\()put_h_32w:
.l_\lable\()put_h_64w:
.l_\lable\()put_h_128w:
    addi.d         t0, a2, 0 // src
    addi.w         t5, a5, 0 // h
    addi.d         t8, a0, 0 // dst
.l_\lable\()put_h_16w_loop:
    vld            vr0, a2, 0
    vldx           vr1, a2, a3
    add.d          a2, a2, t2
    PUT_H_8W       vr0
    PUT_H_8W       vr1
    vssrani.bu.h   vr1, vr0, 6
    vstelm.d       vr1, a0, 0, 0
    add.d          a0, a0, a1
    vstelm.d       vr1, a0, 0, 1
    add.d          a0, a0, a1
    addi.d         a5, a5, -2
    bnez           a5, .l_\lable\()put_h_16w_loop
    addi.d         a2, t0, 8
    addi.d         t0, t0, 8
    addi.d         a0, t8, 8
    addi.d         t8, t8, 8
    addi.w         a5, t5, 0
    addi.w         a4, a4, -8
    bnez           a4, .l_\lable\()put_h_16w_loop
    b              .l_\lable\()end_put_8tap
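
/* Vertical-only path: eight source rows are interleaved once into
 * per-column byte vectors; afterwards each iteration shifts a new row in
 * with vextrins.b/vbsrl.v instead of reloading the whole window. */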

.l_\lable\()put_v:
    ld.d           t1, sp, 0 // filter_type
    srli.w         t1, t1, 2
    blt            t0, a5, .l_\lable\()put_v_idx_fv
    andi           t1, t1, 1
    addi.w         t1, t1, 3

.l_\lable\()put_v_idx_fv:
    addi.w         t5, zero, 120
    mul.w          t1, t1, t5
    addi.w         t5, a7, -1
    slli.w         t5, t5, 3
    add.w          t1, t1, t5
    add.d          t1, t6, t1 // fv's offset
    vldrepl.d      vr8, t1, 0
    sub.d          a2, a2, t3

    clz.w          t1, a4
    li.w           t5, 24
    sub.w          t1, t1, t5
    la.local       t5, .l_\lable\()put_v_jtable
    alsl.d         t1, t1, t5, 3
    ld.d           t6, t1, 0
    add.d          t5, t5, t6
    jirl           $r0, t5, 0

    .align 3
.l_\lable\()put_v_jtable:
    .dword .l_\lable\()put_v_128w - .l_\lable\()put_v_jtable
    .dword .l_\lable\()put_v_64w  - .l_\lable\()put_v_jtable
    .dword .l_\lable\()put_v_32w  - .l_\lable\()put_v_jtable
    .dword .l_\lable\()put_v_16w  - .l_\lable\()put_v_jtable
    .dword .l_\lable\()put_v_8w   - .l_\lable\()put_v_jtable
    .dword .l_\lable\()put_v_4w   - .l_\lable\()put_v_jtable
    .dword .l_\lable\()put_v_2w   - .l_\lable\()put_v_jtable

.l_\lable\()put_v_2w:
    fld.s          f0, a2, 0
    fldx.s         f1, a2, a3
    fldx.s         f2, a2, t2
    add.d          a2, a2, t3
    fld.s          f3, a2, 0
    fldx.s         f4, a2, a3
    fldx.s         f5, a2, t2
    fldx.s         f6, a2, t3
    add.d          a2, a2, t4
    vilvl.b        vr0, vr1, vr0
    vilvl.b        vr1, vr3, vr2
    vilvl.b        vr2, vr5, vr4
    vilvl.b        vr3, vr7, vr6
    vilvl.h        vr0, vr1, vr0
    vilvl.h        vr1, vr3, vr2
    vilvl.w        vr0, vr1, vr0

.l_\lable\()put_v_2w_loop:
    fld.s          f7, a2, 0  // h0
    fldx.s         f10, a2, a3 // h1
    add.d          a2, a2, t2

    vextrins.b     vr0, vr7, 0x70
    vextrins.b     vr0, vr7, 0xf1
    vbsrl.v        vr1, vr0, 1
    vextrins.b     vr1, vr10, 0x70
    vextrins.b     vr1, vr10, 0xf1
    vdp2.h.bu.b    vr10, vr0, vr8
    vdp2.h.bu.b    vr11, vr1, vr8
    vbsrl.v        vr0, vr1, 1
    vhaddw.d.h     vr10
    vhaddw.d.h     vr11
    vpickev.w      vr10, vr11, vr10
    vssrarni.hu.w  vr10, vr10, 6
    vssrani.bu.h   vr10, vr10, 0

    vstelm.h       vr10, a0, 0, 0
    add.d          a0, a0, a1
    vstelm.h       vr10, a0, 0, 1
    add.d          a0, a0, a1
    addi.w         a5, a5, -2
    bnez           a5, .l_\lable\()put_v_2w_loop
    b              .l_\lable\()end_put_8tap

.l_\lable\()put_v_4w:
    fld.s          f0, a2, 0
    fldx.s         f1, a2, a3
    fldx.s         f2, a2, t2
    add.d          a2, a2, t3
    fld.s          f3, a2, 0
    fldx.s         f4, a2, a3
    fldx.s         f5, a2, t2
    fldx.s         f6, a2, t3
    add.d          a2, a2, t4

    vilvl.b        vr0, vr1, vr0
    vilvl.b        vr1, vr3, vr2
    vilvl.b        vr2, vr5, vr4
    vilvl.b        vr3, vr7, vr6
    vilvl.h        vr0, vr1, vr0
    vilvl.h        vr1, vr3, vr2
    vilvl.w        vr2, vr1, vr0
    vilvh.w        vr3, vr1, vr0

.l_\lable\()put_v_4w_loop:
    fld.s          f7, a2, 0
    fldx.s         f10, a2, a3
    add.d          a2, a2, t2

    vextrins.b     vr2, vr7, 0x70
    vextrins.b     vr2, vr7, 0xf1 // x0x1(h0)
    vbsrl.v        vr4, vr2, 1
    vextrins.b     vr4, vr10, 0x70
    vextrins.b     vr4, vr10, 0xf1 // x0x1(h1)
    vdp2.h.bu.b    vr11, vr2, vr8
    vdp2.h.bu.b    vr12, vr4, vr8
    vbsrl.v        vr2, vr4, 1

    vextrins.b     vr3, vr7, 0x72
    vextrins.b     vr3, vr7, 0xf3 // x2x3(h0)
    vbsrl.v        vr4, vr3, 1
    vextrins.b     vr4, vr10, 0x72
    vextrins.b     vr4, vr10, 0xf3 // x2x3(h1)
    vdp2.h.bu.b    vr13, vr3, vr8
    vdp2.h.bu.b    vr14, vr4, vr8
    vbsrl.v        vr3, vr4, 1

    vhaddw.d.h     vr11
    vhaddw.d.h     vr12
    vhaddw.d.h     vr13
    vhaddw.d.h     vr14

    vpickev.w      vr11, vr13, vr11
    vpickev.w      vr12, vr14, vr12
    vpickev.h      vr11, vr12, vr11
    vssrarni.bu.h  vr11, vr11, 6
    vstelm.w       vr11, a0, 0, 0
    add.d          a0, a0, a1
    vstelm.w       vr11, a0, 0, 1
    add.d          a0, a0, a1
    addi.w         a5, a5, -2
    bnez           a5, .l_\lable\()put_v_4w_loop
    b              .l_\lable\()end_put_8tap
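
/* Widths >= 8 run the vertical filter on 8-column strips; t0/t5/t8
 * preserve src/h/dst so the loop can rewind for the next strip. */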

.l_\lable\()put_v_8w:
.l_\lable\()put_v_16w:
.l_\lable\()put_v_32w:
.l_\lable\()put_v_64w:
.l_\lable\()put_v_128w:
    addi.d         t0, a2, 0 // src
    addi.d         t5, a5, 0 // h
    addi.d         t8, a0, 0 // dst
.l_\lable\()put_v_8w_loop0:
    fld.d          f0, a2, 0
    fldx.d         f1, a2, a3
    fldx.d         f2, a2, t2
    add.d          a2, a2, t3
    fld.d          f3, a2, 0
    fldx.d         f4, a2, a3
    fldx.d         f5, a2, t2
    fldx.d         f6, a2, t3
    add.d          a2, a2, t4

    vilvl.b        vr0, vr1, vr0
    vilvl.b        vr1, vr3, vr2
    vilvl.b        vr2, vr5, vr4
    vilvl.b        vr3, vr7, vr6
    vilvl.h        vr4, vr1, vr0
    vilvh.h        vr5, vr1, vr0
    vilvl.h        vr6, vr3, vr2
    vilvh.h        vr7, vr3, vr2
    vilvl.w        vr0, vr6, vr4 // x0x1
    vilvh.w        vr1, vr6, vr4 // x2x3
    vilvl.w        vr2, vr7, vr5 // x4x5
    vilvh.w        vr3, vr7, vr5 // x6x7
.l_\lable\()put_v_8w_loop:
    fld.d          f7, a2, 0
    fldx.d         f10, a2, a3
    add.d          a2, a2, t2
    // h0
    vextrins.b     vr0, vr7, 0x70
    vextrins.b     vr0, vr7, 0xf1
    vextrins.b     vr1, vr7, 0x72
    vextrins.b     vr1, vr7, 0xf3
    vextrins.b     vr2, vr7, 0x74
    vextrins.b     vr2, vr7, 0xf5
    vextrins.b     vr3, vr7, 0x76
    vextrins.b     vr3, vr7, 0xf7
    vdp2.h.bu.b    vr11, vr0, vr8
    vdp2.h.bu.b    vr12, vr1, vr8
    vdp2.h.bu.b    vr13, vr2, vr8
    vdp2.h.bu.b    vr14, vr3, vr8
    vhaddw.d.h     vr11
    vhaddw.d.h     vr12
    vhaddw.d.h     vr13
    vhaddw.d.h     vr14
    vpickev.w      vr11, vr12, vr11
    vpickev.w      vr12, vr14, vr13
    vpickev.h      vr11, vr12, vr11
    vssrarni.bu.h  vr11, vr11, 6
    fst.d          f11, a0, 0
    add.d          a0, a0, a1
    // h1
    vbsrl.v        vr0, vr0, 1
    vbsrl.v        vr1, vr1, 1
    vbsrl.v        vr2, vr2, 1
    vbsrl.v        vr3, vr3, 1
    vextrins.b     vr0, vr10, 0x70
    vextrins.b     vr0, vr10, 0xf1
    vextrins.b     vr1, vr10, 0x72
    vextrins.b     vr1, vr10, 0xf3
    vextrins.b     vr2, vr10, 0x74
    vextrins.b     vr2, vr10, 0xf5
    vextrins.b     vr3, vr10, 0x76
    vextrins.b     vr3, vr10, 0xf7
    vdp2.h.bu.b    vr11, vr0, vr8
    vdp2.h.bu.b    vr12, vr1, vr8
    vdp2.h.bu.b    vr13, vr2, vr8
    vdp2.h.bu.b    vr14, vr3, vr8
    vhaddw.d.h     vr11
    vhaddw.d.h     vr12
    vhaddw.d.h     vr13
    vhaddw.d.h     vr14
    vpickev.w      vr11, vr12, vr11
    vpickev.w      vr12, vr14, vr13
    vpickev.h      vr11, vr12, vr11
    vssrarni.bu.h  vr11, vr11, 6
    fst.d          f11, a0, 0
    add.d          a0, a0, a1
    vbsrl.v        vr0, vr0, 1
    vbsrl.v        vr1, vr1, 1
    vbsrl.v        vr2, vr2, 1
    vbsrl.v        vr3, vr3, 1
    addi.w         a5, a5, -2
    bnez           a5, .l_\lable\()put_v_8w_loop
    addi.d         a2, t0, 8
    addi.d         t0, t0, 8
    addi.d         a0, t8, 8
    addi.d         t8, t8, 8
    addi.d         a5, t5, 0
    addi.w         a4, a4, -8
    bnez           a4, .l_\lable\()put_v_8w_loop0
    b              .l_\lable\()end_put_8tap
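
/* Horizontal + vertical path, matching the hv branch of the C put_8tap
 * (8 bpc, intermediate_bits = 4): the horizontal pass yields 16-bit
 * intermediates rounded by (hsum + 2) >> 2, and the vertical 8-tap pass
 * over those intermediates produces pixels with (vsum + 512) >> 10. */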

.l_\lable\()put_hv:
    ld.d           t5, sp, 0 // filter_type
    andi           t1, t5, 3
    blt            t0, a4, .l_\lable\()put_hv_idx_fh
    andi           t1, t5, 1
    addi.w         t1, t1, 3
.l_\lable\()put_hv_idx_fh:
    addi.w         t5, zero, 120
    mul.w          t1, t1, t5
    addi.w         t5, a6, -1
    slli.w         t5, t5, 3
    add.w          t1, t1, t5
    add.d          t1, t6, t1 // fh's offset
    vldrepl.d      vr8, t1, 0
    ld.d           t1, sp, 0 // filter_type
    srli.w         t1, t1, 2
    blt            t0, a5, .l_\lable\()put_hv_idx_fv
    andi           t1, t1, 1
    addi.w         t1, t1, 3
.l_\lable\()put_hv_idx_fv:
    addi.w         t5, zero, 120
    mul.w          t1, t1, t5
    addi.w         t5, a7, -1
    slli.w         t5, t5, 3
    add.w          t1, t1, t5
    add.d          t1, t6, t1 // fv's offset
    vldrepl.d      vr9, t1, 0
    vexth.h.b      vr9, vr9

    sub.d          a2, a2, t3
    addi.d         a2, a2, -3

    clz.w          t1, a4
    li.w           t5, 24
    sub.w          t1, t1, t5
    la.local       t5, .l_\lable\()put_hv_jtable
    alsl.d         t1, t1, t5, 3
    ld.d           t6, t1, 0
    add.d          t5, t5, t6
    jirl           $r0, t5, 0

    .align 3
.l_\lable\()put_hv_jtable:
    .dword .l_\lable\()put_hv_128w - .l_\lable\()put_hv_jtable
    .dword .l_\lable\()put_hv_64w  - .l_\lable\()put_hv_jtable
    .dword .l_\lable\()put_hv_32w  - .l_\lable\()put_hv_jtable
    .dword .l_\lable\()put_hv_16w  - .l_\lable\()put_hv_jtable
    .dword .l_\lable\()put_hv_8w   - .l_\lable\()put_hv_jtable
    .dword .l_\lable\()put_hv_4w   - .l_\lable\()put_hv_jtable
    .dword .l_\lable\()put_hv_2w   - .l_\lable\()put_hv_jtable

.l_\lable\()put_hv_2w:
    vld            vr0, a2, 0
    vldx           vr1, a2, a3
    vldx           vr2, a2, t2
    add.d          a2, a2, t3
    vld            vr3, a2, 0
    vldx           vr4, a2, a3
    vldx           vr5, a2, t2
    vldx           vr6, a2, t3
    add.d          a2, a2, t4

    vbsrl.v        vr10, vr0, 1
    vbsrl.v        vr11, vr1, 1
    vbsrl.v        vr12, vr2, 1
    vbsrl.v        vr13, vr3, 1
    vbsrl.v        vr14, vr4, 1
    vbsrl.v        vr15, vr5, 1
    vbsrl.v        vr16, vr6, 1
    vilvl.d        vr0, vr10, vr0
    vilvl.d        vr1, vr11, vr1
    vilvl.d        vr2, vr12, vr2
    vilvl.d        vr3, vr13, vr3
    vilvl.d        vr4, vr14, vr4
    vilvl.d        vr5, vr15, vr5
    vilvl.d        vr6, vr16, vr6
    vdp2.h.bu.b    vr10, vr0, vr8
    vdp2.h.bu.b    vr11, vr1, vr8
    vdp2.h.bu.b    vr12, vr2, vr8
    vdp2.h.bu.b    vr13, vr3, vr8
    vdp2.h.bu.b    vr14, vr4, vr8
    vdp2.h.bu.b    vr15, vr5, vr8
    vdp2.h.bu.b    vr16, vr6, vr8
    vhaddw.d.h     vr10
    vhaddw.d.h     vr11
    vhaddw.d.h     vr12
    vhaddw.d.h     vr13
    vhaddw.d.h     vr14
    vhaddw.d.h     vr15
    vhaddw.d.h     vr16

    vpackev.w      vr10, vr11, vr10
    vpackev.w      vr12, vr13, vr12
    vpackod.d      vr11, vr12, vr10
    vpackev.d      vr10, vr12, vr10

    vpackev.w      vr12, vr15, vr14
    vpackev.w      vr16, vr17, vr16
    vpackod.d      vr13, vr16, vr12
    vpackev.d      vr12, vr16, vr12

    vpickev.h      vr10, vr12, vr10 // 0 1 2 3 4 5 6 * (h0)
    vpickev.h      vr11, vr13, vr11 // 8 9 10 11 12 13 14 * (h1)
    vsrari.h       vr10, vr10, 2
    vsrari.h       vr11, vr11, 2
.l_\lable\()put_hv_2w_loop:
    vld            vr7, a2, 0
    vldx           vr12, a2, a3
    add.d          a2, a2, t2

    vbsrl.v        vr1, vr7, 1
    vbsrl.v        vr2, vr12, 1
    vilvl.d        vr0, vr1, vr7
    vilvl.d        vr1, vr2, vr12
    vdp2.h.bu.b    vr2, vr0, vr8
    vdp2.h.bu.b    vr3, vr1, vr8
    vhaddw.d.h     vr2
    vhaddw.d.h     vr3
    vpickev.w      vr2, vr3, vr2
    vpickev.h      vr2, vr2, vr2
    vsrari.h       vr2, vr2, 2
    vextrins.h     vr10, vr2, 0x70 // 0 1 2 3 4 5 6 7
    vextrins.h     vr11, vr2, 0x71
    vbsrl.v        vr12, vr10, 2
    vbsrl.v        vr13, vr11, 2
    vextrins.h     vr12, vr2, 0x72 // 1 2 3 4 5 6 7 8
    vextrins.h     vr13, vr2, 0x73
    vdp2.w.h       vr0, vr10, vr9
    vdp2.w.h       vr1, vr11, vr9
    vdp2.w.h       vr2, vr12, vr9
    vdp2.w.h       vr3, vr13, vr9
    vhaddw.q.w     vr0
    vhaddw.q.w     vr1
    vhaddw.q.w     vr2
    vhaddw.q.w     vr3
    vpackev.w      vr0, vr1, vr0
    vpackev.w      vr1, vr3, vr2
    vpackev.d      vr0, vr1, vr0
    vssrarni.hu.w  vr0, vr0, 10
    vssrani.bu.h   vr0, vr0, 0
    vbsrl.v        vr10, vr12, 2
    vbsrl.v        vr11, vr13, 2
    vstelm.h       vr0, a0, 0, 0
    add.d          a0, a0, a1
    vstelm.h       vr0, a0, 0, 1
    add.d          a0, a0, a1
    addi.d         a5, a5, -2
    bnez           a5, .l_\lable\()put_hv_2w_loop
    b              .l_\lable\()end_put_8tap

.l_\lable\()put_hv_4w:
    vld            vr0, a2, 0
    vldx           vr1, a2, a3
    vldx           vr2, a2, t2
    add.d          a2, a2, t3
    vld            vr3, a2, 0
    vldx           vr4, a2, a3
    vldx           vr5, a2, t2
    vldx           vr6, a2, t3
    add.d          a2, a2, t4
    FILTER_8TAP_4W vr0 // x0 x1 x2 x3
    FILTER_8TAP_4W vr1
    FILTER_8TAP_4W vr2
    FILTER_8TAP_4W vr3
    FILTER_8TAP_4W vr4
    FILTER_8TAP_4W vr5
    FILTER_8TAP_4W vr6
    vpackev.h      vr0, vr1, vr0
    vpackev.h      vr1, vr3, vr2
    vpackev.h      vr2, vr5, vr4
    vpackev.h      vr3, vr7, vr6
    vilvl.w        vr4, vr1, vr0
    vilvh.w        vr5, vr1, vr0
    vilvl.w        vr6, vr3, vr2
    vilvh.w        vr7, vr3, vr2
    vilvl.d        vr0, vr6, vr4 // 0 1 2 3 4 5 6 *
    vilvh.d        vr1, vr6, vr4
    vilvl.d        vr2, vr7, vr5
    vilvh.d        vr3, vr7, vr5
    vsrari.h       vr0, vr0, 2
    vsrari.h       vr1, vr1, 2
    vsrari.h       vr2, vr2, 2
    vsrari.h       vr3, vr3, 2
.l_\lable\()put_hv_4w_loop:
    vld            vr4, a2, 0
    vldx           vr5, a2, a3
    add.d          a2, a2, t2
    FILTER_8TAP_4W vr4
    FILTER_8TAP_4W vr5
    vpickev.h      vr4, vr5, vr4
    vsrari.h       vr4, vr4, 2
    vextrins.h     vr0, vr4, 0x70
    vextrins.h     vr1, vr4, 0x71
    vextrins.h     vr2, vr4, 0x72
    vextrins.h     vr3, vr4, 0x73
    vbsrl.v        vr5, vr0, 2
    vbsrl.v        vr6, vr1, 2
    vbsrl.v        vr7, vr2, 2
    vbsrl.v        vr10, vr3, 2
    vextrins.h     vr5, vr4, 0x74
    vextrins.h     vr6, vr4, 0x75
    vextrins.h     vr7, vr4, 0x76
    vextrins.h     vr10, vr4, 0x77
    vdp2.w.h       vr11, vr0, vr9
    vdp2.w.h       vr12, vr1, vr9
    vdp2.w.h       vr13, vr2, vr9
    vdp2.w.h       vr14, vr3, vr9
    vhaddw.q.w     vr11
    vhaddw.q.w     vr12
    vhaddw.q.w     vr13
    vhaddw.q.w     vr14
    vpackev.w      vr0, vr12, vr11
    vpackev.w      vr1, vr14, vr13
    vpackev.d      vr0, vr1, vr0
    vdp2.w.h       vr11, vr5, vr9
    vdp2.w.h       vr12, vr6, vr9
    vdp2.w.h       vr13, vr7, vr9
    vdp2.w.h       vr14, vr10, vr9
    vhaddw.q.w     vr11
    vhaddw.q.w     vr12
    vhaddw.q.w     vr13
    vhaddw.q.w     vr14
    vpackev.w      vr1, vr12, vr11
    vpackev.w      vr2, vr14, vr13
    vpackev.d      vr1, vr2, vr1
    vssrarni.hu.w  vr1, vr0, 10
    vssrani.bu.h   vr1, vr1, 0
    vstelm.w       vr1, a0, 0, 0
    add.d          a0, a0, a1
    vstelm.w       vr1, a0, 0, 1
    add.d          a0, a0, a1
    vbsrl.v        vr0, vr5, 2
    vbsrl.v        vr1, vr6, 2
    vbsrl.v        vr2, vr7, 2
    vbsrl.v        vr3, vr10, 2
    addi.w         a5, a5, -2
    bnez           a5, .l_\lable\()put_hv_4w_loop
    b              .l_\lable\()end_put_8tap

.l_\lable\()put_hv_8w:
.l_\lable\()put_hv_16w:
.l_\lable\()put_hv_32w:
.l_\lable\()put_hv_64w:
.l_\lable\()put_hv_128w:
    addi.d         t0, a2, 0 // src
    addi.d         t5, a5, 0 // h
    addi.d         t8, a0, 0 // dst
.l_\lable\()put_hv_8w_loop0:
    vld            vr0, a2, 0
    vldx           vr1, a2, a3
    vldx           vr2, a2, t2
    add.d          a2, a2, t3
    vld            vr3, a2, 0
    vldx           vr4, a2, a3
    vldx           vr5, a2, t2
    vldx           vr6, a2, t3
    add.d          a2, a2, t4
    FILTER_8TAP_8W vr0
    FILTER_8TAP_8W vr1
    FILTER_8TAP_8W vr2
    FILTER_8TAP_8W vr3
    FILTER_8TAP_8W vr4
    FILTER_8TAP_8W vr5
    FILTER_8TAP_8W vr6
    LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7,\
                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7,\
                       vr10,vr11,vr12,vr13,vr14,vr15,vr16,vr17
.l_\lable\()put_hv_8w_loop:
    vld            vr20, a2, 0
    vldx           vr21, a2, a3
    add.d          a2, a2, t2
    FILTER_8TAP_8W vr20
    FILTER_8TAP_8W vr21
    VEXTRINS_Hx8   vr20
    FILTER_8TAP_8W_CLIP_STORE
    VBSRL_Vx8
    VEXTRINS_Hx8   vr21
    FILTER_8TAP_8W_CLIP_STORE
    VBSRL_Vx8
    addi.w         a5, a5, -2
    bnez           a5, .l_\lable\()put_hv_8w_loop
    addi.d         a2, t0, 8
    addi.d         t0, t0, 8
    addi.d         a0, t8, 8
    addi.d         t8, t8, 8
    addi.d         a5, t5, 0
    addi.w         a4, a4, -8
    bnez           a4, .l_\lable\()put_hv_8w_loop0
.l_\lable\()end_put_8tap:
.endm
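
/* Nine entry points cover the 3x3 combinations of regular/smooth/sharp.
 * The value stored at sp, 0 encodes them as (v_type << 2) | h_type with
 * regular = 0, smooth = 1, sharp = 2; the macro reads it back as
 * filter_type (andi 3 for fh, srli 2 for fv). */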

function put_8tap_regular_8bpc_lsx
    addi.d         sp, sp, -16
    st.d           zero, sp, 0
    PUT_8TAP_8BPC_LSX 0
    addi.d         sp, sp, 16
endfunc

function put_8tap_smooth_regular_8bpc_lsx
    addi.d         sp, sp, -16
    li.w           t0, 1
    st.d           t0, sp, 0
    PUT_8TAP_8BPC_LSX 1
    addi.d         sp, sp, 16
endfunc

function put_8tap_sharp_regular_8bpc_lsx
    addi.d         sp, sp, -16
    li.w           t0, 2
    st.d           t0, sp, 0
    PUT_8TAP_8BPC_LSX 2
    addi.d         sp, sp, 16
endfunc

function put_8tap_regular_smooth_8bpc_lsx
    addi.d         sp, sp, -16
    li.w           t0, 4
    st.d           t0, sp, 0
    PUT_8TAP_8BPC_LSX 4
    addi.d         sp, sp, 16
endfunc

function put_8tap_smooth_8bpc_lsx
    addi.d         sp, sp, -16
    li.w           t0, 5
    st.d           t0, sp, 0
    PUT_8TAP_8BPC_LSX 5
    addi.d         sp, sp, 16
endfunc

function put_8tap_sharp_smooth_8bpc_lsx
    addi.d         sp, sp, -16
    li.w           t0, 6
    st.d           t0, sp, 0
    PUT_8TAP_8BPC_LSX 6
    addi.d         sp, sp, 16
endfunc

function put_8tap_regular_sharp_8bpc_lsx
    addi.d         sp, sp, -16
    li.w           t0, 8
    st.d           t0, sp, 0
    PUT_8TAP_8BPC_LSX 8
    addi.d         sp, sp, 16
endfunc

function put_8tap_smooth_sharp_8bpc_lsx
    addi.d         sp, sp, -16
    li.w           t0, 9
    st.d           t0, sp, 0
    PUT_8TAP_8BPC_LSX 9
    addi.d         sp, sp, 16
endfunc

function put_8tap_sharp_8bpc_lsx
    addi.d         sp, sp, -16
    li.w           t0, 10
    st.d           t0, sp, 0
    PUT_8TAP_8BPC_LSX 10
    addi.d         sp, sp, 16
endfunc
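
/* shufb1 feeds xvshuf.b inside SHUFB below: from a source vector and its
 * 2-byte shift it builds four overlapping 8-byte windows (byte offsets
 * 0..3, two per 128-bit lane), i.e. the inputs of four neighbouring
 * 8-tap dot products for a single xvdp2.h.bu.b. */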

const shufb1
.byte 0,1,2,3,4,5,6,7,1,2,3,4,5,6,7,8,0,1,2,3,4,5,6,7,1,2,3,4,5,6,7,8
endconst

.macro SHUFB in0, in1, tmp, out
    xvbsrl.v       \tmp, \in0, 2
    xvpermi.q      \tmp, \in0, 0x20
    xvshuf.b       \out, \tmp, \tmp, \in1
.endm

.macro HADDWDH in0
    xvhaddw.w.h    \in0, \in0, \in0
    xvhaddw.d.w    \in0, \in0, \in0
.endm

.macro HADDWQW in0
    xvhaddw.d.w    \in0, \in0, \in0
    xvhaddw.q.d    \in0, \in0, \in0
.endm

.macro PREP_W16_H in0
    xvbsrl.v       xr4, \in0, 4
    xvbsrl.v       xr5, \in0, 8
    xvpermi.q      xr9, \in0, 0x31
    xvpackev.d     xr5, xr9, xr5
    xvbsrl.v       xr6, xr5, 4
    SHUFB          \in0, xr23, xr9, \in0
    SHUFB          xr4, xr23, xr9, xr4
    SHUFB          xr5, xr23, xr9, xr5
    SHUFB          xr6, xr23, xr9, xr6
    xvdp2.h.bu.b   xr10, \in0, xr22
    xvdp2.h.bu.b   xr11, xr4, xr22
    xvdp2.h.bu.b   xr12, xr5, xr22
    xvdp2.h.bu.b   xr13, xr6, xr22
    HADDWDH        xr10
    HADDWDH        xr11
    HADDWDH        xr12
    HADDWDH        xr13
    xvpickev.w     xr10, xr11, xr10
    xvpickev.w     xr11, xr13, xr12
    xvpermi.d      xr10, xr10, 0xd8
    xvpermi.d      xr11, xr11, 0xd8
    xvpickev.h     xr10, xr11, xr10
    xvpermi.d      xr10, xr10, 0xd8
    xvsrari.h      \in0, xr10, 2
.endm

.macro PREP_8TAP_8BPC_LASX lable
    li.w           t0, 4
    la.local       t6, dav1d_mc_subpel_filters
    la.local       t7, shufb1
    xvld           xr23, t7, 0
    slli.d         t2, a2, 1  // src_stride*2
    add.d          t3, t2, a2 // src_stride*3
    slli.d         t4, t2, 1  // src_stride*4

    bnez           a5, .l_\lable\()h // mx
    bnez           a6, .l_\lable\()v

    clz.w          t1, a3
    li.w           t5, 24
    sub.w          t1, t1, t5
    la.local       t5, .l_\lable\()prep_hv0_jtable
    alsl.d         t1, t1, t5, 1
    ld.h           t8, t1, 0
    add.d          t5, t5, t8
    jirl           $r0, t5, 0

    .align 3
.l_\lable\()prep_hv0_jtable:
    .hword .l_\lable\()hv0_128w - .l_\lable\()prep_hv0_jtable
    .hword .l_\lable\()hv0_64w  - .l_\lable\()prep_hv0_jtable
    .hword .l_\lable\()hv0_32w  - .l_\lable\()prep_hv0_jtable
    .hword .l_\lable\()hv0_16w  - .l_\lable\()prep_hv0_jtable
    .hword .l_\lable\()hv0_8w   - .l_\lable\()prep_hv0_jtable
    .hword .l_\lable\()hv0_4w   - .l_\lable\()prep_hv0_jtable

.l_\lable\()hv0_4w:
    fld.s          f0, a1, 0
    fldx.s         f1, a1, a2
    fldx.s         f2, a1, t2
    fldx.s         f3, a1, t3
    add.d          a1, a1, t4
    xvpackev.w     xr0, xr1, xr0
    xvpackev.w     xr1, xr3, xr2
    xvpermi.q      xr0, xr1, 0x02
    xvsllwil.hu.bu xr0, xr0, 4
    xvst           xr0, a0, 0
    addi.d         a0, a0, 32
    addi.d         a4, a4, -4
    bnez           a4, .l_\lable\()hv0_4w
    b              .l_\lable\()end_pre_8tap
.l_\lable\()hv0_8w:
    fld.d          f0, a1, 0
    fldx.d         f1, a1, a2
    fldx.d         f2, a1, t2
    fldx.d         f3, a1, t3
    add.d          a1, a1, t4
    xvpermi.q      xr0, xr1, 0x02
    xvpermi.q      xr2, xr3, 0x02
    xvsllwil.hu.bu xr0, xr0, 4
    xvsllwil.hu.bu xr2, xr2, 4
    xvst           xr0, a0, 0
    xvst           xr2, a0, 32
    addi.d         a0, a0, 64
    addi.d         a4, a4, -4
    bnez           a4, .l_\lable\()hv0_8w
    b              .l_\lable\()end_pre_8tap
.l_\lable\()hv0_16w:
    vld            vr0, a1, 0
    vldx           vr1, a1, a2
    vldx           vr2, a1, t2
    vldx           vr3, a1, t3
    add.d          a1, a1, t4
    vext2xv.hu.bu  xr0, xr0
    vext2xv.hu.bu  xr1, xr1
    vext2xv.hu.bu  xr2, xr2
    vext2xv.hu.bu  xr3, xr3
    xvslli.h       xr0, xr0, 4
    xvslli.h       xr1, xr1, 4
    xvslli.h       xr2, xr2, 4
    xvslli.h       xr3, xr3, 4
    xvst           xr0, a0, 0
    xvst           xr1, a0, 32
    xvst           xr2, a0, 64
    xvst           xr3, a0, 96
    addi.d         a0, a0, 128
    addi.d         a4, a4, -4
    bnez           a4, .l_\lable\()hv0_16w
    b              .l_\lable\()end_pre_8tap
.l_\lable\()hv0_32w:
    xvld           xr0, a1, 0
    xvldx          xr1, a1, a2
    xvldx          xr2, a1, t2
    xvldx          xr3, a1, t3
    add.d          a1, a1, t4
    xvpermi.d      xr4, xr0, 0xD8
    xvpermi.d      xr5, xr1, 0xD8
    xvpermi.d      xr6, xr2, 0xD8
    xvpermi.d      xr7, xr3, 0xD8
    xvpermi.d      xr10, xr0, 0x32
    xvpermi.d      xr11, xr1, 0x32
    xvpermi.d      xr12, xr2, 0x32
    xvpermi.d      xr13, xr3, 0x32
    xvsllwil.hu.bu xr0, xr4, 4
    xvsllwil.hu.bu xr1, xr5, 4
    xvsllwil.hu.bu xr2, xr6, 4
    xvsllwil.hu.bu xr3, xr7, 4
    xvsllwil.hu.bu xr4, xr10, 4
    xvsllwil.hu.bu xr5, xr11, 4
    xvsllwil.hu.bu xr6, xr12, 4
    xvsllwil.hu.bu xr7, xr13, 4
    xvst           xr0, a0, 0
    xvst           xr4, a0, 32
    xvst           xr1, a0, 64
    xvst           xr5, a0, 96
    xvst           xr2, a0, 128
    xvst           xr6, a0, 160
    xvst           xr3, a0, 192
    xvst           xr7, a0, 224
    addi.d         a0, a0, 256
    addi.d         a4, a4, -4
    bnez           a4, .l_\lable\()hv0_32w
    b              .l_\lable\()end_pre_8tap
.l_\lable\()hv0_64w:
.l_\lable\()hv0_128w:
    addi.d         t0, a1, 0
    addi.d         t5, a4, 0
    srli.w         t7, a3, 5
    slli.w         t7, t7, 6
    addi.d         t8, a0, 0
.l_\lable\()hv0_32_loop:
    xvld           xr0, a1, 0
    xvldx          xr1, a1, a2
    xvldx          xr2, a1, t2
    xvldx          xr3, a1, t3
    add.d          a1, a1, t4
    xvpermi.d      xr4, xr0, 0xD8
    xvpermi.d      xr5, xr1, 0xD8
    xvpermi.d      xr6, xr2, 0xD8
    xvpermi.d      xr7, xr3, 0xD8
    xvpermi.d      xr10, xr0, 0x32
    xvpermi.d      xr11, xr1, 0x32
    xvpermi.d      xr12, xr2, 0x32
    xvpermi.d      xr13, xr3, 0x32
    xvsllwil.hu.bu xr0, xr4, 4
    xvsllwil.hu.bu xr1, xr5, 4
    xvsllwil.hu.bu xr2, xr6, 4
    xvsllwil.hu.bu xr3, xr7, 4
    xvsllwil.hu.bu xr4, xr10, 4
    xvsllwil.hu.bu xr5, xr11, 4
    xvsllwil.hu.bu xr6, xr12, 4
    xvsllwil.hu.bu xr7, xr13, 4
    xvst           xr0, a0, 0
    xvst           xr4, a0, 32
    add.d          t1, a0, t7
    xvst           xr1, t1, 0
    xvst           xr5, t1, 32
    add.d          t1, t1, t7
    xvst           xr2, t1, 0
    xvst           xr6, t1, 32
    add.d          t1, t1, t7
    xvst           xr3, t1, 0
    xvst           xr7, t1, 32
    add.d          a0, t1, t7
    addi.d         a4, a4, -4
    bnez           a4, .l_\lable\()hv0_32_loop
    addi.d         a1, t0, 32
    addi.d         t0, t0, 32
    addi.d         a0, t8, 64
    addi.d         t8, t8, 64
    addi.d         a4, t5, 0
    addi.d         a3, a3, -32
    bnez           a3, .l_\lable\()hv0_32_loop
    b              .l_\lable\()end_pre_8tap
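
/* The hv0 paths above store pixel << 4, the 8 bpc intermediate format of
 * prep. The filtered paths below emit the same format directly: the
 * 8-tap sum is rounded with (sum + 2) >> 2 rather than scaled back to
 * pixels. */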

.l_\lable\()h:
    bnez           a6, .l_\lable\()hv // if (fh && fv)

    andi           t1, a7, 3
    blt            t0, a3, .l_\lable\()h_idx_fh
    andi           t1, a7, 1
    addi.w         t1, t1, 3
.l_\lable\()h_idx_fh:
    addi.w         t5, zero, 120
    mul.w          t1, t1, t5
    addi.w         t5, a5, -1
    slli.w         t5, t5, 3
    add.w          t1, t1, t5
    add.d          t1, t6, t1 // fh's offset
    xvldrepl.d     xr22, t1, 0

    addi.d         a1, a1, -3
    clz.w          t1, a3
    li.w           t5, 24
    sub.w          t1, t1, t5
    la.local       t5, .l_\lable\()prep_h_jtable
    alsl.d         t1, t1, t5, 1
    ld.h           t8, t1, 0
    add.d          t5, t5, t8
    jirl           $r0, t5, 0

    .align 3
.l_\lable\()prep_h_jtable:
    .hword .l_\lable\()h_128w - .l_\lable\()prep_h_jtable
    .hword .l_\lable\()h_64w  - .l_\lable\()prep_h_jtable
    .hword .l_\lable\()h_32w  - .l_\lable\()prep_h_jtable
    .hword .l_\lable\()h_16w  - .l_\lable\()prep_h_jtable
    .hword .l_\lable\()h_8w   - .l_\lable\()prep_h_jtable
    .hword .l_\lable\()h_4w   - .l_\lable\()prep_h_jtable

.l_\lable\()h_4w:
    xvld           xr0, a1, 0
    xvldx          xr1, a1, a2
    xvldx          xr2, a1, t2
    xvldx          xr3, a1, t3
    add.d          a1, a1, t4

    SHUFB          xr0, xr23, xr9, xr0
    SHUFB          xr1, xr23, xr9, xr1
    SHUFB          xr2, xr23, xr9, xr2
    SHUFB          xr3, xr23, xr9, xr3

    xvdp2.h.bu.b   xr10, xr0, xr22
    xvdp2.h.bu.b   xr12, xr1, xr22
    xvdp2.h.bu.b   xr14, xr2, xr22
    xvdp2.h.bu.b   xr16, xr3, xr22

    HADDWDH        xr10 // h0 mid0 mid1 mid2 mid3
    HADDWDH        xr12 // h1 mid4 mid5 mid6 mid7
    HADDWDH        xr14 // h2
    HADDWDH        xr16 // h3

    xvpickev.w     xr10, xr12, xr10
    xvpickev.w     xr14, xr16, xr14
    xvpermi.d      xr10, xr10, 0xd8
    xvpermi.d      xr14, xr14, 0xd8
    xvpickev.h     xr10, xr14, xr10
    xvpermi.d      xr10, xr10, 0xd8
    xvsrari.h      xr10, xr10, 2

    xvst           xr10, a0, 0
    addi.d         a0, a0, 32
    addi.w         a4, a4, -4
    bnez           a4, .l_\lable\()h_4w
    b              .l_\lable\()end_pre_8tap

.l_\lable\()h_8w:
    xvld           xr0, a1, 0
    xvldx          xr2, a1, a2
    xvldx          xr4, a1, t2
    xvldx          xr6, a1, t3
    add.d          a1, a1, t4

    xvbsrl.v       xr1, xr0, 4
    xvbsrl.v       xr3, xr2, 4
    xvbsrl.v       xr5, xr4, 4
    xvbsrl.v       xr7, xr6, 4

    SHUFB          xr0, xr23, xr9, xr10
    SHUFB          xr1, xr23, xr9, xr11
    SHUFB          xr2, xr23, xr9, xr12
    SHUFB          xr3, xr23, xr9, xr13
    SHUFB          xr4, xr23, xr9, xr14
    SHUFB          xr5, xr23, xr9, xr15
    SHUFB          xr6, xr23, xr9, xr16
    SHUFB          xr7, xr23, xr9, xr17

    xvdp2.h.bu.b   xr0, xr10, xr22
    xvdp2.h.bu.b   xr1, xr11, xr22
    xvdp2.h.bu.b   xr2, xr12, xr22
    xvdp2.h.bu.b   xr3, xr13, xr22
    xvdp2.h.bu.b   xr4, xr14, xr22
    xvdp2.h.bu.b   xr5, xr15, xr22
    xvdp2.h.bu.b   xr6, xr16, xr22
    xvdp2.h.bu.b   xr7, xr17, xr22

    HADDWDH        xr0
    HADDWDH        xr1
    HADDWDH        xr2
    HADDWDH        xr3
    HADDWDH        xr4
    HADDWDH        xr5
    HADDWDH        xr6
    HADDWDH        xr7

    xvpickev.w     xr0, xr1, xr0
    xvpickev.w     xr2, xr3, xr2
    xvpermi.d      xr0, xr0, 0xd8
    xvpermi.d      xr2, xr2, 0xd8
    xvpickev.h     xr0, xr2, xr0
    xvpermi.d      xr0, xr0, 0xd8
    xvsrari.h      xr0, xr0, 2

    xvpickev.w     xr4, xr5, xr4
    xvpickev.w     xr6, xr7, xr6
    xvpermi.d      xr4, xr4, 0xd8
    xvpermi.d      xr6, xr6, 0xd8
    xvpickev.h     xr4, xr6, xr4
    xvpermi.d      xr4, xr4, 0xd8
    xvsrari.h      xr4, xr4, 2

    xvst           xr0, a0, 0
    xvst           xr4, a0, 32
    addi.d         a0, a0, 64
    addi.d         a4, a4, -4
    bnez           a4, .l_\lable\()h_8w
    b              .l_\lable\()end_pre_8tap

.l_\lable\()h_16w:
    xvld           xr0, a1, 0
    xvldx          xr1, a1, a2
    xvldx          xr2, a1, t2
    xvldx          xr3, a1, t3
    add.d          a1, a1, t4

    PREP_W16_H     xr0
    PREP_W16_H     xr1
    PREP_W16_H     xr2
    PREP_W16_H     xr3

    xvst           xr0, a0, 0
    xvst           xr1, a0, 32
    xvst           xr2, a0, 64
    xvst           xr3, a0, 96

    addi.d         a0, a0, 128
    addi.w         a4, a4, -4
    bnez           a4, .l_\lable\()h_16w
    b              .l_\lable\()end_pre_8tap

.l_\lable\()h_32w:
.l_\lable\()h_64w:
.l_\lable\()h_128w:
    addi.d         t0, a1, 0 // src
    addi.d         t5, a4, 0 // h
    srli.w         t7, a3, 4 // w
    slli.w         t7, t7, 5 // store offset
    addi.d         t8, a0, 0 // dst
.l_\lable\()h_16_loop:
    xvld           xr0, a1, 0
    xvldx          xr1, a1, a2
    xvldx          xr2, a1, t2
    xvldx          xr3, a1, t3
    add.d          a1, a1, t4

    PREP_W16_H     xr0
    PREP_W16_H     xr1
    PREP_W16_H     xr2
    PREP_W16_H     xr3

    xvst           xr0, a0, 0
    xvstx          xr1, a0, t7
    slli.w         t1, t7, 1
    xvstx          xr2, a0, t1
    add.w          t1, t1, t7
    xvstx          xr3, a0, t1
    slli.w         t1, t7, 2
    add.d          a0, a0, t1
    addi.d         a4, a4, -4
    bnez           a4, .l_\lable\()h_16_loop

    addi.d         a1, t0, 16
    addi.d         t0, t0, 16
    addi.d         a0, t8, 32
    addi.d         t8, t8, 32
    addi.d         a4, t5, 0
    addi.d         a3, a3, -16
    bnez           a3, .l_\lable\()h_16_loop
    b              .l_\lable\()end_pre_8tap
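
/* Horizontal + vertical prep: as in the put case the first pass emits
 * (hsum + 2) >> 2 intermediates, and the vertical pass rounds with
 * (vsum + 32) >> 6 (the xvsrari.w ..., 6 below) before storing 16-bit
 * results. */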
.l_\lable\()hv:
    andi           t1, a7, 3
    blt            t0, a3, .l_\lable\()hv_idx_fh
    andi           t1, a7, 1
    addi.w         t1, t1, 3
.l_\lable\()hv_idx_fh:
    addi.w         t5, zero, 120
    mul.w          t1, t1, t5
    addi.w         t5, a5, -1
    slli.w         t5, t5, 3
    add.w          t1, t1, t5
    add.d          t1, t6, t1 // fh's offset
    xvldrepl.d     xr22, t1, 0
    srli.w         a7, a7, 2
    blt            t0, a4, .l_\lable\()hv_idx_fv
    andi           a7, a7, 1
    addi.w         a7, a7, 3
.l_\lable\()hv_idx_fv:
    addi.w         t5, zero, 120
    mul.w          a7, a7, t5
    addi.w         t5, a6, -1
    slli.w         t5, t5, 3
    add.w          a7, a7, t5
    add.d          a7, t6, a7 // fv's offset
    xvldrepl.d     xr8, a7, 0
    xvsllwil.h.b   xr8, xr8, 0

    sub.d          a1, a1, t3
    addi.d         a1, a1, -3
    beq            a3, t0, .l_\lable\()hv_4w
    b              .l_\lable\()hv_8w
.l_\lable\()hv_4w:
    xvld           xr0, a1, 0
    xvldx          xr1, a1, a2
    xvldx          xr2, a1, t2
    xvldx          xr3, a1, t3
    add.d          a1, a1, t4
    xvld           xr4, a1, 0
    xvldx          xr5, a1, a2
    xvldx          xr6, a1, t2

    SHUFB          xr0, xr23, xr9, xr0
    SHUFB          xr1, xr23, xr9, xr1
    SHUFB          xr2, xr23, xr9, xr2
    SHUFB          xr3, xr23, xr9, xr3

    SHUFB          xr4, xr23, xr9, xr4
    SHUFB          xr5, xr23, xr9, xr5
    SHUFB          xr6, xr23, xr9, xr6

    xvdp2.h.bu.b   xr10, xr0, xr22
    xvdp2.h.bu.b   xr11, xr1, xr22
    xvdp2.h.bu.b   xr12, xr2, xr22
    xvdp2.h.bu.b   xr13, xr3, xr22

    xvdp2.h.bu.b   xr14, xr4, xr22
    xvdp2.h.bu.b   xr15, xr5, xr22
    xvdp2.h.bu.b   xr16, xr6, xr22

    HADDWDH        xr10 // h0 mid0 mid1 mid2 mid3
    HADDWDH        xr11 // h1 mid4 mid5 mid6 mid7
    HADDWDH        xr12 // h2
    HADDWDH        xr13 // h3

    xvpackev.w     xr10, xr11, xr10
    xvpackev.w     xr12, xr13, xr12
    xvpackev.d     xr11, xr12, xr10
    xvpackod.d     xr10, xr12, xr10
    xvpickev.h     xr11, xr10, xr11
    xvsrari.h      xr11, xr11, 2

    HADDWDH        xr14 // h4
    HADDWDH        xr15 // h5
    HADDWDH        xr16 // h6

    xvpackev.w     xr14, xr15, xr14
    xvpackev.w     xr16, xr17, xr16
    xvpackev.d     xr17, xr16, xr14
    xvpackod.d     xr14, xr16, xr14
    xvpickev.h     xr13, xr14, xr17
    xvsrari.h      xr13, xr13, 2

    xvpackev.d     xr18, xr13, xr11 // 0 4 8 12 16 20 24 * 2 6 10 14 18 22 26 *
    xvpackod.d     xr19, xr13, xr11 // 1 5 9 13 17 21 25 * 3 7 11 15 19 23 27 *
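
/* xr18/xr19 now hold the seven warm-up rows as packed columns (lane maps
 * in the comments above). Each loop iteration computes four new rows,
 * inserts them one by one into the top slot with xvextrins.h and slides
 * the window with xvbsrl.v. */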
.l_\lable\()hv_w4_loop:
    xvldx          xr0, a1, t3
    add.d          a1, a1, t4
    xvld           xr1, a1, 0
    xvldx          xr2, a1, a2
    xvldx          xr3, a1, t2

    SHUFB          xr0, xr23, xr9, xr0
    SHUFB          xr1, xr23, xr9, xr1
    SHUFB          xr2, xr23, xr9, xr2
    SHUFB          xr3, xr23, xr9, xr3

    xvdp2.h.bu.b   xr10, xr0, xr22
    xvdp2.h.bu.b   xr12, xr1, xr22
    xvdp2.h.bu.b   xr14, xr2, xr22
    xvdp2.h.bu.b   xr16, xr3, xr22

    HADDWDH        xr10 // h0 mid0 mid1 mid2 mid3
    HADDWDH        xr12 // h1 mid4 mid5 mid6 mid7
    HADDWDH        xr14 // h2
    HADDWDH        xr16 // h3

    xvpackev.w     xr10, xr12, xr10
    xvpackev.w     xr14, xr16, xr14
    xvpackev.d     xr12, xr14, xr10
    xvpackod.d     xr10, xr14, xr10
    xvpickev.h     xr12, xr10, xr12
    xvsrari.h      xr12, xr12, 2

    xvextrins.h    xr18, xr12, 0x70 // 0 4 8 12 16 20 24 0(x0) 2 6 10 14 18 22 26 2(x2)
    xvextrins.h    xr19, xr12, 0x74 // 1 5 9 13 17 21 25 0(x1) 3 7 11 15 19 23 27 2(x3)

    xvdp2.w.h      xr0, xr18, xr8
    xvdp2.w.h      xr2, xr19, xr8
    HADDWQW        xr0
    HADDWQW        xr2
    xvpackev.w     xr0, xr2, xr0

    xvbsrl.v       xr18, xr18, 2
    xvbsrl.v       xr19, xr19, 2
    xvextrins.h    xr18, xr12, 0x71
    xvextrins.h    xr19, xr12, 0x75
    xvdp2.w.h      xr2, xr18, xr8
    xvdp2.w.h      xr4, xr19, xr8
    HADDWQW        xr2
    HADDWQW        xr4
    xvpackev.w     xr2, xr4, xr2

    xvbsrl.v       xr18, xr18, 2
    xvbsrl.v       xr19, xr19, 2
    xvextrins.h    xr18, xr12, 0x72
    xvextrins.h    xr19, xr12, 0x76
    xvdp2.w.h      xr4, xr18, xr8
    xvdp2.w.h      xr9, xr19, xr8
    HADDWQW        xr4
    HADDWQW        xr9
    xvpackev.w     xr4, xr9, xr4

    xvbsrl.v       xr18, xr18, 2
    xvbsrl.v       xr19, xr19, 2
    xvextrins.h    xr18, xr12, 0x73
    xvextrins.h    xr19, xr12, 0x77
    xvdp2.w.h      xr9, xr18, xr8
    xvdp2.w.h      xr11, xr19, xr8
    HADDWQW        xr9
    HADDWQW        xr11
    xvpackev.w     xr9, xr11, xr9

    xvpackev.d     xr0, xr2, xr0
    xvpackev.d     xr4, xr9, xr4
    xvsrari.w      xr0, xr0, 6
    xvsrari.w      xr4, xr4, 6
    xvpermi.d      xr0, xr0, 0xd8
    xvpermi.d      xr4, xr4, 0xd8
    xvpickev.h     xr0, xr4, xr0
    xvpermi.d      xr0, xr0, 0xd8
    xvst           xr0, a0, 0
    addi.d         a0, a0, 32

    xvbsrl.v       xr18, xr18, 2
    xvbsrl.v       xr19, xr19, 2

    addi.d         a4, a4, -4
    bnez           a4, .l_\lable\()hv_w4_loop
    b              .l_\lable\()end_pre_8tap

.l_\lable\()hv_8w:
    addi.d         t0, a1, 0
    addi.d         t5, a4, 0
    srli.w         t7, a3, 3
    slli.w         t7, t7, 4 // store offset
    addi.d         t8, a0, 0
.l_\lable\()hv_8w_loop0:
    xvld           xr0, a1, 0
    xvldx          xr2, a1, a2
    xvldx          xr4, a1, t2
    xvldx          xr6, a1, t3

    add.d          a1, a1, t4
    xvld           xr10, a1, 0
    xvldx          xr11, a1, a2
    xvldx          xr12, a1, t2

    xvbsrl.v       xr1, xr0, 4
    xvbsrl.v       xr3, xr2, 4
    xvbsrl.v       xr5, xr4, 4
    xvbsrl.v       xr7, xr6, 4

    SHUFB          xr0, xr23, xr9, xr13
    SHUFB          xr1, xr23, xr9, xr14
    SHUFB          xr2, xr23, xr9, xr15
    SHUFB          xr3, xr23, xr9, xr16
    SHUFB          xr4, xr23, xr9, xr17
    SHUFB          xr5, xr23, xr9, xr18
    SHUFB          xr6, xr23, xr9, xr19
    SHUFB          xr7, xr23, xr9, xr20

    xvdp2.h.bu.b   xr0, xr13, xr22
    xvdp2.h.bu.b   xr1, xr14, xr22
    xvdp2.h.bu.b   xr2, xr15, xr22
    xvdp2.h.bu.b   xr3, xr16, xr22
    xvdp2.h.bu.b   xr4, xr17, xr22
    xvdp2.h.bu.b   xr5, xr18, xr22
    xvdp2.h.bu.b   xr6, xr19, xr22
    xvdp2.h.bu.b   xr7, xr20, xr22

    HADDWDH        xr0
    HADDWDH        xr1
    HADDWDH        xr2
    HADDWDH        xr3
    HADDWDH        xr4
    HADDWDH        xr5
    HADDWDH        xr6
    HADDWDH        xr7

    xvpackev.w     xr0, xr2, xr0
    xvpackev.w     xr2, xr6, xr4
    xvpackev.d     xr16, xr2, xr0
    xvpackod.d     xr0, xr2, xr0
    xvpickev.h     xr0, xr0, xr16
    xvsrari.h      xr0, xr0, 2 // 0 8 16 24 1 9 17 25 2 10 18 26 3 11 19 27

    xvpackev.w     xr1, xr3, xr1
    xvpackev.w     xr3, xr7, xr5
    xvpackev.d     xr16, xr3, xr1
    xvpackod.d     xr1, xr3, xr1
    xvpickev.h     xr1, xr1, xr16
    xvsrari.h      xr1, xr1, 2 // 4 12 20 28 5 13 21 29 6 14 22 30 7 15 23 31

    xvbsrl.v       xr13, xr10, 4
    xvbsrl.v       xr14, xr11, 4
    xvbsrl.v       xr15, xr12, 4

    SHUFB          xr10, xr23, xr9, xr10
    SHUFB          xr13, xr23, xr9, xr13
    SHUFB          xr11, xr23, xr9, xr11
    SHUFB          xr14, xr23, xr9, xr14
    SHUFB          xr12, xr23, xr9, xr12
    SHUFB          xr15, xr23, xr9, xr15

    xvdp2.h.bu.b   xr4, xr10, xr22
    xvdp2.h.bu.b   xr5, xr13, xr22
    xvdp2.h.bu.b   xr6, xr11, xr22
    xvdp2.h.bu.b   xr7, xr14, xr22
    xvdp2.h.bu.b   xr9, xr12, xr22
    xvdp2.h.bu.b   xr10, xr15, xr22

    HADDWDH        xr4
    HADDWDH        xr5
    HADDWDH        xr6
    HADDWDH        xr7
    HADDWDH        xr9
    HADDWDH        xr10

    xvpackev.w     xr4, xr6, xr4
    xvpackev.w     xr9, xr12, xr9
    xvpackev.d     xr16, xr9, xr4
    xvpackod.d     xr11, xr9, xr4
    xvpickev.h     xr2, xr11, xr16
    xvsrari.h      xr2, xr2, 2 // 32 40 48 * 33 41 49 * 34 42 50 * 35 43 51 *

    xvpackev.w     xr5, xr7, xr5
    xvpackev.w     xr10, xr12, xr10
    xvpackev.d     xr16, xr10, xr5
    xvpackod.d     xr11, xr10, xr5
    xvpickev.h     xr3, xr11, xr16
    xvsrari.h      xr3, xr3, 2 // 36 44 52 * 37 45 53 * 38 46 54 * 39 47 55 *

    xvpackev.d     xr18, xr2, xr0 // 0 8 16 24 32 40 48 * 2 10 18 26 34 42 50 *
    xvpackod.d     xr19, xr2, xr0 // 1 9 17 25 33 41 49 * 3 11 19 27 35 43 51 *
    xvpackev.d     xr20, xr3, xr1 // 4 12 20 28 36 44 52 * 6 14 22 30 38 46 54 *
    xvpackod.d     xr21, xr3, xr1 // 5 13 21 29 37 45 53 * 7 15 23 31 39 47 55 *
    //h - 1
    xvdp2.w.h xr10, xr18, xr8
    xvdp2.w.h xr11, xr19, xr8
    xvdp2.w.h xr12, xr20, xr8
    xvdp2.w.h xr13, xr21, xr8

    HADDWQW xr10
    HADDWQW xr11
    HADDWQW xr12
    HADDWQW xr13

    xvpackev.w xr2, xr11, xr10 //0 1 * * 2 3 * *
    xvpackev.w xr3, xr13, xr12 //4 5 * * 6 7 * *
    xvpackev.d xr2, xr3, xr2 //0 1 4 5 2 3 6 7
    //h - 2
    xvbsrl.v xr4, xr18, 2
    xvbsrl.v xr5, xr19, 2
    xvbsrl.v xr6, xr20, 2
    xvbsrl.v xr7, xr21, 2
    xvextrins.h xr4, xr0, 0x71
    xvextrins.h xr5, xr0, 0x75
    xvextrins.h xr6, xr1, 0x71
    xvextrins.h xr7, xr1, 0x75

    xvdp2.w.h xr10, xr4, xr8
    xvdp2.w.h xr11, xr5, xr8
    xvdp2.w.h xr12, xr6, xr8
    xvdp2.w.h xr13, xr7, xr8

    HADDWQW xr10
    HADDWQW xr11
    HADDWQW xr12
    HADDWQW xr13

    xvpackev.w xr14, xr11, xr10
    xvpackev.w xr15, xr13, xr12
    xvpackev.d xr14, xr15, xr14 //8 9 12 13 10 11 14 15
    //h - 3
    xvbsrl.v xr4, xr4, 2
    xvbsrl.v xr5, xr5, 2
    xvbsrl.v xr6, xr6, 2
    xvbsrl.v xr7, xr7, 2
    xvextrins.h xr4, xr0, 0x72
    xvextrins.h xr5, xr0, 0x76
    xvextrins.h xr6, xr1, 0x72
    xvextrins.h xr7, xr1, 0x76

    xvdp2.w.h xr10, xr4, xr8
    xvdp2.w.h xr11, xr5, xr8
    xvdp2.w.h xr12, xr6, xr8
    xvdp2.w.h xr13, xr7, xr8

    HADDWQW xr10
    HADDWQW xr11
    HADDWQW xr12
    HADDWQW xr13

    xvpackev.w xr15, xr11, xr10
    xvpackev.w xr16, xr13, xr12
    xvpackev.d xr15, xr16, xr15 //16 17 20 21 18 19 22 23
    //h - 4
    xvbsrl.v xr4, xr4, 2
    xvbsrl.v xr5, xr5, 2
    xvbsrl.v xr6, xr6, 2
    xvbsrl.v xr7, xr7, 2
    xvextrins.h xr4, xr0, 0x73
    xvextrins.h xr5, xr0, 0x77
    xvextrins.h xr6, xr1, 0x73
    xvextrins.h xr7, xr1, 0x77

    xvdp2.w.h xr10, xr4, xr8
    xvdp2.w.h xr11, xr5, xr8
    xvdp2.w.h xr12, xr6, xr8
    xvdp2.w.h xr13, xr7, xr8

    HADDWQW xr10
    HADDWQW xr11
    HADDWQW xr12
    HADDWQW xr13

    xvpackev.w xr16, xr11, xr10
    xvpackev.w xr17, xr13, xr12
    xvpackev.d xr16, xr17, xr16 //24 25 28 29 26 27 30 31

    xvsrari.w xr2, xr2, 6
    xvsrari.w xr14, xr14, 6
    xvsrari.w xr15, xr15, 6
    xvsrari.w xr16, xr16, 6

    xvpermi.d xr2, xr2, 0xd8
    xvpermi.d xr14, xr14, 0xd8
    xvpermi.d xr15, xr15, 0xd8
    xvpermi.d xr16, xr16, 0xd8
    xvpickev.h xr2, xr14, xr2
    xvpickev.h xr3, xr16, xr15
    xvpermi.d xr2, xr2, 0xd8
    xvpermi.d xr3, xr3, 0xd8

    xvpermi.q xr10, xr2, 0x31
    xvpermi.q xr11, xr3, 0x31

    vst vr2, a0, 0
    vstx vr10, a0, t7 //32
    slli.w t1, t7, 1 //64
    vstx vr3, a0, t1
    add.w t1, t1, t7 //96
    vstx vr11, a0, t1
    slli.w t1, t7, 2 //128
    add.d a0, a0, t1

    xvbsrl.v xr18, xr4, 2
    xvbsrl.v xr19, xr5, 2
    xvbsrl.v xr20, xr6, 2
    xvbsrl.v xr21, xr7, 2

    addi.d a4, a4, -4
    bnez a4, .l_\lable\()hv_8w_loop

    addi.d a1, t0, 8
    addi.d t0, t0, 8
    addi.d a0, t8, 16
    addi.d t8, t8, 16
    addi.d a4, t5, 0
    addi.d a3, a3, -8
    bnez a3, .l_\lable\()hv_8w_loop0
    b .l_\lable\()end_pre_8tap
.l_\lable\()v:

    srli.w a7, a7, 2 // vertical filter type: filter_type >> 2
    blt t0, a4, .l_\lable\()v_idx_fv // keep the 8-tap table when h > 4
    andi a7, a7, 1
    addi.w a7, a7, 3 // 4-tap table: 3 + (type & 1)
.l_\lable\()v_idx_fv:
    addi.w t5, zero, 120 // 15 subpel positions * 8 taps per table
    mul.w a7, a7, t5
    addi.w t5, a6, -1
    slli.w t5, t5, 3 // (my - 1) * 8
    add.w a7, a7, t5
    add.d a7, t6, a7 //fv's offset
    xvldrepl.d xr8, a7, 0

    sub.d a1, a1, t3
    beq a3, t0, .l_\lable\()v_4w // w == 4
    blt t0, a3, .l_\lable\()v_8w // w >= 8
.l_\lable\()v_4w:
    fld.s f0, a1, 0
    fldx.s f1, a1, a2
    fldx.s f2, a1, t2
    add.d a1, a1, t3
    fld.s f3, a1, 0
    fldx.s f4, a1, a2
    fldx.s f5, a1, t2
    fldx.s f6, a1, t3

    xvilvl.b xr0, xr1, xr0 // 0 1 8 9 16 17 24 25
    xvilvl.b xr1, xr3, xr2 // 2 3 10 11 18 19 26 27
    xvilvl.b xr2, xr5, xr4 // 4 5 12 13 20 21 28 29
    xvilvl.b xr3, xr7, xr6 // 6 7 14 15 22 23 30 31
    xvilvl.h xr0, xr1, xr0 // 0 1 2 3 8 9 10 11 16 17 18 19 24 25 26 27
    xvilvl.h xr1, xr3, xr2 // 4 5 6 7 12 13 14 15 20 21 22 23 28 29 30 31
    xvilvl.w xr2, xr1, xr0
    xvilvh.w xr0, xr1, xr0
    xvpermi.q xr0, xr2, 0x20

.l_\lable\()v_4w_loop:
    add.d a1, a1, t4
    fld.s f7, a1, 0 //h0
    fldx.s f10, a1, a2 //h1
    fldx.s f11, a1, t2 //h2
    fldx.s f12, a1, t3 //h3

    xvbsrl.v xr9, xr7, 2
    xvpermi.q xr9, xr7, 0x20
    xvextrins.b xr0, xr9, 0x70
    xvextrins.b xr0, xr9, 0xf1

    xvbsrl.v xr1, xr0, 1
    xvbsrl.v xr7, xr10, 2
    xvpermi.q xr7, xr10, 0x20
    xvextrins.b xr1, xr7, 0x70
    xvextrins.b xr1, xr7, 0xf1

    xvbsrl.v xr2, xr1, 1
    xvbsrl.v xr7, xr11, 2
    xvpermi.q xr7, xr11, 0x20
    xvextrins.b xr2, xr7, 0x70
    xvextrins.b xr2, xr7, 0xf1

    xvbsrl.v xr3, xr2, 1
    xvbsrl.v xr7, xr12, 2
    xvpermi.q xr7, xr12, 0x20
    xvextrins.b xr3, xr7, 0x70
    xvextrins.b xr3, xr7, 0xf1
    xvbsrl.v xr4, xr3, 1

    xvdp2.h.bu.b xr10, xr0, xr8
    xvdp2.h.bu.b xr11, xr1, xr8
    xvdp2.h.bu.b xr12, xr2, xr8
    xvdp2.h.bu.b xr13, xr3, xr8
    HADDWDH xr10
    HADDWDH xr11
    HADDWDH xr12
    HADDWDH xr13
    xvpickev.w xr10, xr11, xr10
    xvpickev.w xr11, xr13, xr12
    xvpermi.d xr10, xr10, 0xd8
    xvpermi.d xr11, xr11, 0xd8
    xvpickev.h xr10, xr11, xr10
    xvpermi.d xr10, xr10, 0xd8
    xvsrari.h xr10, xr10, 2

    xvaddi.bu xr0, xr4, 0

    xvst xr10, a0, 0
    addi.d a0, a0, 32
    addi.w a4, a4, -4
    bnez a4, .l_\lable\()v_4w_loop
    b .l_\lable\()end_pre_8tap
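
/* w >= 8: the vertical-only path below processes the block in 4-pixel-wide
 * strips. The inner loop filters 4 rows per iteration; the outer loop
 * (v_8w_loop0) then advances src by 4 pixels and the int16_t tmp pointer by
 * 4 columns (8 bytes). t7, computed below as (w / 4) * 8 == 2 * w, is the
 * row stride of tmp in bytes.
 */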
.l_\lable\()v_8w:
    addi.d t0, a1, 0
    addi.d t5, a4, 0
    srli.w t7, a3, 2
    slli.w t7, t7, 3
    addi.d t8, a0, 0
.l_\lable\()v_8w_loop0:
    fld.s f0, a1, 0
    fldx.s f1, a1, a2
    fldx.s f2, a1, t2
    add.d a1, a1, t3
    fld.s f3, a1, 0
    fldx.s f4, a1, a2
    fldx.s f5, a1, t2
    fldx.s f6, a1, t3

    xvilvl.b xr0, xr1, xr0 // 0 1 8 9 16 17 24 25
    xvilvl.b xr1, xr3, xr2 // 2 3 10 11 18 19 26 27
    xvilvl.b xr2, xr5, xr4 // 4 5 12 13 20 21 28 29
    xvilvl.b xr3, xr7, xr6 // 6 7 14 15 22 23 30 31
    xvilvl.h xr0, xr1, xr0 // 0 1 2 3 8 9 10 11 16 17 18 19 24 25 26 27
    xvilvl.h xr1, xr3, xr2 // 4 5 6 7 12 13 14 15 20 21 22 23 28 29 30 31
    xvilvl.w xr2, xr1, xr0
    xvilvh.w xr0, xr1, xr0
    xvpermi.q xr0, xr2, 0x20

.l_\lable\()v_8w_loop:
    add.d a1, a1, t4
    fld.s f7, a1, 0 //h0
    fldx.s f10, a1, a2 //h1
    fldx.s f11, a1, t2 //h2
    fldx.s f12, a1, t3 //h3

    xvbsrl.v xr9, xr7, 2
    xvpermi.q xr9, xr7, 0x20
    xvextrins.b xr0, xr9, 0x70
    xvextrins.b xr0, xr9, 0xf1

    xvbsrl.v xr1, xr0, 1
    xvbsrl.v xr7, xr10, 2
    xvpermi.q xr7, xr10, 0x20
    xvextrins.b xr1, xr7, 0x70
    xvextrins.b xr1, xr7, 0xf1

    xvbsrl.v xr2, xr1, 1
    xvbsrl.v xr7, xr11, 2
    xvpermi.q xr7, xr11, 0x20
    xvextrins.b xr2, xr7, 0x70
    xvextrins.b xr2, xr7, 0xf1

    xvbsrl.v xr3, xr2, 1
    xvbsrl.v xr7, xr12, 2
    xvpermi.q xr7, xr12, 0x20
    xvextrins.b xr3, xr7, 0x70
    xvextrins.b xr3, xr7, 0xf1
    xvbsrl.v xr4, xr3, 1

    xvdp2.h.bu.b xr10, xr0, xr8
    xvdp2.h.bu.b xr11, xr1, xr8
    xvdp2.h.bu.b xr12, xr2, xr8
    xvdp2.h.bu.b xr13, xr3, xr8
    HADDWDH xr10
    HADDWDH xr11
    HADDWDH xr12
    HADDWDH xr13
    xvpickev.w xr10, xr11, xr10
    xvpickev.w xr11, xr13, xr12
    xvpermi.d xr10, xr10, 0xd8
    xvpermi.d xr11, xr11, 0xd8
    xvpickev.h xr10, xr11, xr10
    xvpermi.d xr10, xr10, 0xd8
    xvsrari.h xr10, xr10, 2

    xvaddi.bu xr0, xr4, 0

    xvstelm.d xr10, a0, 0, 0
    add.d a0, a0, t7
    xvstelm.d xr10, a0, 0, 1
    add.d a0, a0, t7
    xvstelm.d xr10, a0, 0, 2
    add.d a0, a0, t7
    xvstelm.d xr10, a0, 0, 3
    add.d a0, a0, t7
    addi.w a4, a4, -4
    bnez a4, .l_\lable\()v_8w_loop

    addi.d a1, t0, 4
    addi.d t0, t0, 4
    addi.d a0, t8, 8
    addi.d t8, t8, 8
    addi.d a4, t5, 0
    addi.d a3, a3, -4
    bnez a3, .l_\lable\()v_8w_loop0

.l_\lable\()end_pre_8tap:
.endm
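
/*
 * The entry points below match dav1d's mct function signature (shown here
 * for reference, after decl_mct_fn in src/mc.h; for 8 bpc, pixel is uint8_t):
 *
 * void prep_8tap_<filter>_8bpc_lasx(int16_t *tmp,
 *                                   const pixel *src, ptrdiff_t src_stride,
 *                                   int w, int h, int mx, int my);
 *
 * The immediate loaded into a7 (and passed to PREP_8TAP_8BPC_LASX) encodes
 * the filter pair as type_h | (type_v << 2), with regular = 0, smooth = 1
 * and sharp = 2.
 */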
function prep_8tap_regular_8bpc_lasx
    addi.w a7, zero, 0
    PREP_8TAP_8BPC_LASX 0
endfunc

function prep_8tap_smooth_regular_8bpc_lasx
    addi.w a7, zero, 1
    PREP_8TAP_8BPC_LASX 1
endfunc

function prep_8tap_sharp_regular_8bpc_lasx
    addi.w a7, zero, 2
    PREP_8TAP_8BPC_LASX 2
endfunc

function prep_8tap_regular_smooth_8bpc_lasx
    addi.w a7, zero, 4
    PREP_8TAP_8BPC_LASX 4
endfunc

function prep_8tap_smooth_8bpc_lasx
    addi.w a7, zero, 5
    PREP_8TAP_8BPC_LASX 5
endfunc

function prep_8tap_sharp_smooth_8bpc_lasx
    addi.w a7, zero, 6
    PREP_8TAP_8BPC_LASX 6
endfunc

function prep_8tap_regular_sharp_8bpc_lasx
    addi.w a7, zero, 8
    PREP_8TAP_8BPC_LASX 8
endfunc

function prep_8tap_smooth_sharp_8bpc_lasx
    addi.w a7, zero, 9
    PREP_8TAP_8BPC_LASX 9
endfunc

function prep_8tap_sharp_8bpc_lasx
    addi.w a7, zero, 10
    PREP_8TAP_8BPC_LASX 10
endfunc