1; Copyright © 2021, VideoLAN and dav1d authors 2; Copyright © 2021, Two Orioles, LLC 3; All rights reserved. 4; 5; Redistribution and use in source and binary forms, with or without 6; modification, are permitted provided that the following conditions are met: 7; 8; 1. Redistributions of source code must retain the above copyright notice, this 9; list of conditions and the following disclaimer. 10; 11; 2. Redistributions in binary form must reproduce the above copyright notice, 12; this list of conditions and the following disclaimer in the documentation 13; and/or other materials provided with the distribution. 14; 15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA 64

; Emits the AV1 smooth-prediction weight tables in two layouts:
; a 1-D table (weight << 7, consumed via pmulhrsw) and a 2-D table of
; interleaved (weight, 256 - weight) pairs consumed via pmaddwd.
%macro SMOOTH_WEIGHTS 1-*
const smooth_weights_1d_16bpc ; sm_weights[] << 7
    %rep %0
        dw %1*128
        %rotate 1
    %endrep
const smooth_weights_2d_16bpc ; sm_weights[], 256 - sm_weights[]
    %rep %0
        dw %1, 256-%1
        %rotate 1
    %endrep
%endmacro

; Concatenated weight rows for block sizes 4, 8, 16, 32 and 64.
SMOOTH_WEIGHTS   0,   0, 255, 128, 255, 149,  85,  64, \
               255, 197, 146, 105,  73,  50,  37,  32, \
               255, 225, 196, 170, 145, 123, 102,  84, \
                68,  54,  43,  33,  26,  20,  17,  16, \
               255, 240, 225, 210, 196, 182, 169, 157, \
               145, 133, 122, 111, 101,  92,  83,  74, \
                66,  59,  52,  45,  39,  34,  29,  25, \
                21,  17,  14,  12,  10,   9,   8,   8, \
               255, 248, 240, 233, 225, 218, 210, 203, \
               196, 189, 182, 176, 169, 163, 156, 150, \
               144, 138, 133, 127, 121, 116, 111, 106, \
               101,  96,  91,  86,  82,  77,  73,  69, \
                65,  61,  57,  54,  50,  47,  44,  41, \
                38,  35,  32,  29,  27,  25,  22,  20, \
                18,  16,  15,  13,  12,  10,   9,   8, \
                 7,   6,   6,   5,   5,   4,   4,   4

%if ARCH_X86_64

; Shuffle masks and per-mode constants; see the individual predictors
; below for how each one is used.
ipred_hv_shuf:  db  6,  7,  6,  7,  0,  1,  2,  3,  2,  3,  2,  3,  8,  9, 10, 11
                db  4,  5,  4,  5,  4,  5,  6,  7,  0,  1,  0,  1, 12, 13, 14, 15
filter_shuf1:   db  8,  9,  0,  1,  2,  3,  4,  5,  6,  7, 14, 15, 12, 13, -1, -1
filter_shuf2:   db  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,  4,  5,  2,  3, -1, -1
filter_shuf3:   db 12, 13,  0,  1,  2,  3,  4,  5,  6,  7, 10, 11,  8,  9, -1, -1
pal_pred_shuf:  db  0,  2,  4,  6,  8, 10, 12, 14,  1,  3,  5,  7,  9, 11, 13, 15
z_base_inc:     dw  0*64,  1*64,  2*64,  3*64,  4*64,  5*64,  6*64,  7*64
                dw  8*64,  9*64, 10*64, 11*64, 12*64, 13*64, 14*64, 15*64
z_filter_t0:    db 55,127, 39,127, 39,127,  7, 15, 31,  7, 15, 31,  0,  3, 31,  0
z_filter_t1:    db 39, 63, 19, 47, 19, 47,  3,  3,  3,  3,  3,  3,  0,  0,  0,  0
z_filter_wh:    db  7,  7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39
                db 39, 39, 47, 47, 47, 63, 63, 63, 79, 79, 79, -1
pw_m1024:       times 2 dw -1024
pw_1to16:       dw  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16
pw_16to1:       dw 16, 15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1
z2_ymul:        dw  1,  2,  1,  2,  1,  2,  1,  2,  3,  4,  3,  4,  3,  4,  3,  4
z2_ymul8:       dw  1,  2,  5,  6,  3,  4,  7,  8,  5,  6, 16, 16,  7,  8
pb_90:          times 4 db 90
z2_y_shuf_h4:   dd  3,  7,  2,  6,  1,  5,  0,  4
z_upsample:     db  0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
z2_x_shuf:      db  0,  1,  2,  3,  4,  5,  6,  7,  2,  3,  4,  5,  6,  7,  8,  9
z2_y_shuf:      db  6,  7, 14, 15,  4,  5, 12, 13,  4,  5, 12, 13,  2,  3, 10, 11
z2_y_shuf_us:   db  6,  7, 14, 15,  2,  3, 10, 11,  4,  5, 12, 13,  0,  1,  8,  9
z_filter_k:     dw  4,  4,  5,  5,  4,  4
                dw  8,  8,  6,  6,  4,  4
                dw  0,  0,  0,  0,  2,  2

; Common constants aliased to matching words inside existing tables
; (saves rodata space).
%define pw_2  (z_filter_k+32)
%define pw_4  (z_filter_k+ 0)
%define pw_16 (z2_ymul8 +20)

pw_1:    times 2 dw 1
pw_3:    times 2 dw 3
pw_62:   times 2 dw 62
pw_512:  times 2 dw 512
pw_2048: times 2 dw 2048
pd_8:    dd 8

; Builds a table of 32-bit offsets from the (biased) table base to the
; named local labels of function %1_%2. The base symbol is biased by
; -2*4 so callers can index with tzcnt(w or h) values starting at 2.
%macro JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - 2*4)
    %xdefine %%base mangle(private_prefix %+ _%1_%2)
    %%table:
    %rep %0 - 2
        dd %%base %+ .%3 - (%%table - 2*4)
        %rotate 1
    %endrep
%endmacro

; The dc/cfl splat tables live inside the dc/cfl tables, after the
; h* and w* entries (10 resp. 8 leading entries are skipped).
%define ipred_dc_splat_16bpc_avx2_table  (ipred_dc_16bpc_avx2_table + 10*4)
%define ipred_cfl_splat_16bpc_avx2_table (ipred_cfl_16bpc_avx2_table + 8*4)

JMP_TABLE ipred_dc_16bpc,         avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
                                        s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
JMP_TABLE ipred_dc_left_16bpc,    avx2, h4, h8, h16, h32, h64
JMP_TABLE ipred_h_16bpc,          avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_paeth_16bpc,      avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_16bpc,     avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_h_16bpc,   avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_v_16bpc,   avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_z1_16bpc,         avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_z2_16bpc,         avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_z3_16bpc,         avx2, h4, h8, h16, h32, h64
JMP_TABLE ipred_filter_16bpc,     avx2, w4, w8, w16, w32
JMP_TABLE ipred_cfl_16bpc,        avx2, h4, h8, h16, h32, w4, w8, w16, w32, \
                                        s4-8*4, s8-8*4, s16-8*4, s32-8*4
JMP_TABLE ipred_cfl_left_16bpc,   avx2, h4, h8, h16, h32
JMP_TABLE ipred_cfl_ac_444_16bpc, avx2, w4, w8, w16, w32

JMP_TABLE pal_pred_16bpc,         avx2, w4, w8, w16, w32, w64

cextern dr_intra_derivative
cextern filter_intra_taps

SECTION .text

INIT_YMM avx2

; DC prediction from the top edge only. Jumps into ipred_dc_left's
; h4-h64 summing code (via r6) and then into ipred_dc's s4-s64 splat
; stores (via wq).
cglobal ipred_dc_top_16bpc, 3, 7, 6, dst, stride, tl, w, h
    movifnidn            hd, hm
    add                 tlq, 2
    movd                xm4, wd
    pxor                xm3, xm3
    pavgw               xm4, xm3                ; rounding bias = w/2
    tzcnt                wd, wd
    movd                xm5, wd                 ; shift = log2(w)
    movu                 m0, [tlq]
    lea                  r5, [ipred_dc_left_16bpc_avx2_table]
    movsxd               r6, [r5+wq*4]
    add                  r6, r5
    add                  r5, ipred_dc_splat_16bpc_avx2_table-ipred_dc_left_16bpc_avx2_table
    movsxd               wq, [r5+wq*4]
    add                  wq, r5
    jmp                  r6

; DC prediction from the left edge only. .h4-.h64 reduce the edge sum
; to a single average, then fall through into the width-selected splat
; code (shared with ipred_dc) via wq.
cglobal ipred_dc_left_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
    mov                  hd, hm
    sub                 tlq, hq
    movd                xm4, hd
    sub                 tlq, hq                 ; tl -= 2*h (left edge start)
    pxor                xm3, xm3
    pavgw               xm4, xm3                ; rounding bias = h/2
    tzcnt               r6d, hd
    movd                xm5, r6d                ; shift = log2(h)
    movu                 m0, [tlq]
    lea                  r5, [ipred_dc_left_16bpc_avx2_table]
    movsxd               r6, [r5+r6*4]
    add                  r6, r5
    add                  r5, ipred_dc_splat_16bpc_avx2_table-ipred_dc_left_16bpc_avx2_table
    tzcnt                wd, wd
    movsxd               wq, [r5+wq*4]
    add                  wq, r5
    jmp                  r6
.h64:
    paddw                m0, [tlq+96]
    paddw                m0, [tlq+64]
.h32:
    paddw                m0, [tlq+32]
.h16:
    vextracti128        xm1, m0, 1
    paddw               xm0, xm1
.h8:
    psrldq              xm1, xm0, 8
    paddw               xm0, xm1
.h4:
    punpcklwd           xm0, xm3                ; widen to dwords
    psrlq               xm1, xm0, 32
    paddd               xm0, xm1
    psrldq              xm1, xm0, 8
    paddd               xm0, xm1
    paddd               xm0, xm4                ; + bias
    psrld               xm0, xm5                ; / number of edge pixels
    lea            stride3q, [strideq*3]
    vpbroadcastw         m0, xm0
    mova                 m1, m0
    mova                 m2, m0
    mova                 m3, m0
    jmp                  wq

; Full DC prediction: averages both edges. Non-square blocks divide
; the w+h pixel sum via a fixed-point reciprocal multiply
; (0xAAAB ~ 2/3, 0x6667 ~ 2/5) instead of an actual division.
cglobal ipred_dc_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
    movifnidn            hd, hm
    tzcnt               r6d, hd
    lea                 r5d, [wq+hq]
    movd                xm4, r5d
    tzcnt               r5d, r5d
    movd                xm5, r5d                ; shift = log2(w+h) (square case)
    lea                  r5, [ipred_dc_16bpc_avx2_table]
    tzcnt                wd, wd
    movsxd               r6, [r5+r6*4]
    movsxd               wq, [r5+wq*4+5*4]      ; skip the 5 h* entries
    pxor                 m3, m3
    psrlw               xm4, 1                  ; rounding bias = (w+h)/2
    add                  r6, r5
    add                  wq, r5
    lea            stride3q, [strideq*3]
    jmp                  r6
.h4:
    movq                xm0, [tlq-8]
    jmp                  wq
.w4:
    movq                xm1, [tlq+2]
    paddw                m0, m4
    paddw                m0, m1
    psrlq                m1, m0, 32
    paddw                m0, m1
    psrld                m1, m0, 16
    paddw                m0, m1
    cmp                  hd, 4
    jg .w4_mul
    psrlw               xm0, 3                  ; 4x4: sum/8
    jmp .w4_end
.w4_mul:
    vextracti128        xm1, m0, 1
    paddw               xm0, xm1
    lea                 r2d, [hq*2]
    mov                 r6d, 0xAAAB6667
    shrx                r6d, r6d, r2d           ; pick reciprocal by h
    punpckhwd           xm1, xm0, xm3
    punpcklwd           xm0, xm3
    paddd               xm0, xm1
    movd                xm1, r6d
    psrld               xm0, 2
    pmulhuw             xm0, xm1
    psrlw               xm0, 1
.w4_end:
    vpbroadcastw        xm0, xm0
.s4:
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*1], xm0
    movq   [dstq+strideq*2], xm0
    movq   [dstq+stride3q ], xm0
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .s4
    RET
ALIGN function_align
.h8:
    mova                xm0, [tlq-16]
    jmp                  wq
.w8:
    vextracti128        xm1, m0, 1
    paddw               xm0, [tlq+2]
    paddw               xm0, xm4
    paddw               xm0, xm1
    psrld               xm1, xm0, 16
    paddw               xm0, xm1
    pblendw             xm0, xm3, 0xAA          ; zero the odd words
    psrlq               xm1, xm0, 32
    paddd               xm0, xm1
    psrldq              xm1, xm0, 8
    paddd               xm0, xm1
    psrld               xm0, xm5
    cmp                  hd, 8
    je .w8_end
    mov                 r6d, 0xAAAB
    mov                 r2d, 0x6667
    cmp                  hd, 32
    cmovz               r6d, r2d
    movd                xm1, r6d
    pmulhuw             xm0, xm1
    psrlw               xm0, 1
.w8_end:
    vpbroadcastw        xm0, xm0
.s8:
    mova   [dstq+strideq*0], xm0
    mova   [dstq+strideq*1], xm0
    mova   [dstq+strideq*2], xm0
    mova   [dstq+stride3q ], xm0
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .s8
    RET
ALIGN function_align
.h16:
    mova                 m0, [tlq-32]
    jmp                  wq
.w16:
    paddw                m0, [tlq+2]
    vextracti128        xm1, m0, 1
    paddw               xm0, xm4
    paddw               xm0, xm1
    punpckhwd           xm1, xm0, xm3
    punpcklwd           xm0, xm3
    paddd               xm0, xm1
    psrlq               xm1, xm0, 32
    paddd               xm0, xm1
    psrldq              xm1, xm0, 8
    paddd               xm0, xm1
    psrld               xm0, xm5
    cmp                  hd, 16
    je .w16_end
    mov                 r6d, 0xAAAB
    mov                 r2d, 0x6667
    test                 hb, 8|32                ; h == 8 or 32?
    cmovz               r6d, r2d
    movd                xm1, r6d
    pmulhuw             xm0, xm1
    psrlw               xm0, 1
.w16_end:
    vpbroadcastw         m0, xm0
.s16:
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m0
    mova   [dstq+strideq*2], m0
    mova   [dstq+stride3q ], m0
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .s16
    RET
ALIGN function_align
.h32:
    mova                 m0, [tlq-64]
    paddw                m0, [tlq-32]
    jmp                  wq
.w32:
    paddw                m0, [tlq+ 2]
    paddw                m0, [tlq+34]
    vextracti128        xm1, m0, 1
    paddw               xm0, xm4
    paddw               xm0, xm1
    punpcklwd           xm1, xm0, xm3
    punpckhwd           xm0, xm3
    paddd               xm0, xm1
    psrlq               xm1, xm0, 32
    paddd               xm0, xm1
    psrldq              xm1, xm0, 8
    paddd               xm0, xm1
    psrld               xm0, xm5
    cmp                  hd, 32
    je .w32_end
    lea                 r2d, [hq*2]
    mov                 r6d, 0x6667AAAB
    shrx                r6d, r6d, r2d
    movd                xm1, r6d
    pmulhuw             xm0, xm1
    psrlw               xm0, 1
.w32_end:
    vpbroadcastw         m0, xm0
    mova                 m1, m0
.s32:
    mova [dstq+strideq*0+32*0], m0
    mova [dstq+strideq*0+32*1], m1
    mova [dstq+strideq*1+32*0], m0
    mova [dstq+strideq*1+32*1], m1
    mova [dstq+strideq*2+32*0], m0
    mova [dstq+strideq*2+32*1], m1
    mova [dstq+stride3q +32*0], m0
    mova [dstq+stride3q +32*1], m1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .s32
    RET
ALIGN function_align
.h64:
    mova                 m0, [tlq-128]
    mova                 m1, [tlq- 96]
    paddw                m0, [tlq- 64]
    paddw                m1, [tlq- 32]
    paddw                m0, m1
    jmp                  wq
.w64:
    movu                 m1, [tlq+ 2]
    paddw                m0, [tlq+34]
    paddw                m1, [tlq+66]
    paddw                m0, [tlq+98]
    paddw                m0, m1
    vextracti128        xm1, m0, 1
    paddw               xm0, xm1
    punpcklwd           xm1, xm0, xm3
    punpckhwd           xm0, xm3
    paddd               xm1, xm4
    paddd               xm0, xm1
    psrlq               xm1, xm0, 32
    paddd               xm0, xm1
    psrldq              xm1, xm0, 8
    paddd               xm0, xm1
    psrld               xm0, xm5
    cmp                  hd, 64
    je .w64_end
    mov                 r6d, 0x6667AAAB
    shrx                r6d, r6d, hd
    movd                xm1, r6d
    pmulhuw             xm0, xm1
    psrlw               xm0, 1
.w64_end:
    vpbroadcastw         m0, xm0
    mova                 m1, m0
    mova                 m2, m0
    mova                 m3, m0
.s64:
    mova [dstq+strideq*0+32*0], m0
    mova [dstq+strideq*0+32*1], m1
    mova [dstq+strideq*0+32*2], m2
    mova [dstq+strideq*0+32*3], m3
    mova [dstq+strideq*1+32*0], m0
    mova [dstq+strideq*1+32*1], m1
    mova [dstq+strideq*1+32*2], m2
    mova [dstq+strideq*1+32*3], m3
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .s64
    RET

; DC-128 prediction: fills the block with the bitdepth midpoint.
; bitdepth_max (r8m) >> 11 selects pw_512 (10-bit) or the adjacent
; pw_2048 (12-bit). Jumps into ipred_dc's splat stores.
cglobal ipred_dc_128_16bpc, 2, 7, 6, dst, stride, tl, w, h, stride3
    mov                 r6d, r8m
    shr                 r6d, 11
    lea                  r5, [ipred_dc_splat_16bpc_avx2_table]
    tzcnt                wd, wd
    movifnidn            hd, hm
    movsxd               wq, [r5+wq*4]
    vpbroadcastd         m0, [r5-ipred_dc_splat_16bpc_avx2_table+pw_512+r6*4]
    mova                 m1, m0
    mova                 m2, m0
    mova                 m3, m0
    add                  wq, r5
    lea            stride3q, [strideq*3]
    jmp                  wq

; Vertical prediction: copies the top row (up to 64 pixels, m0-m3) to
; every output row, reusing ipred_dc's splat stores.
cglobal ipred_v_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
    movifnidn            hd, hm
    movu                 m0, [tlq+ 2]
    movu                 m1, [tlq+34]
    movu                 m2, [tlq+66]
    movu                 m3, [tlq+98]
    lea                  r5, [ipred_dc_splat_16bpc_avx2_table]
    tzcnt                wd, wd
    movsxd               wq, [r5+wq*4]
    add                  wq, r5
    lea            stride3q, [strideq*3]
    jmp                  wq

; Stores 4 rows, each filled with a broadcast of one left-edge pixel.
; %2 selects the store width (q = 8 bytes, a = full register).
%macro IPRED_H 2 ; w, store_type
    vpbroadcastw         m0, [tlq-2]
    vpbroadcastw         m1, [tlq-4]
    vpbroadcastw         m2, [tlq-6]
    vpbroadcastw         m3, [tlq-8]
    sub                 tlq, 8
    mov%2  [dstq+strideq*0], m0
    mov%2  [dstq+strideq*1], m1
    mov%2  [dstq+strideq*2], m2
    mov%2  [dstq+stride3q ], m3
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w%1
    RET
ALIGN function_align
%endmacro

; Horizontal prediction: each output row is its left-neighbor pixel.
cglobal ipred_h_16bpc, 3, 6, 4, dst, stride, tl, w, h, stride3
    movifnidn            hd, hm
    lea                  r5, [ipred_h_16bpc_avx2_table]
    tzcnt                wd, wd
    movsxd               wq, [r5+wq*4]
    add                  wq, r5
    lea            stride3q, [strideq*3]
    jmp                  wq
INIT_XMM avx2
.w4:
    IPRED_H               4, q
.w8:
    IPRED_H               8, a
INIT_YMM avx2
.w16:
    IPRED_H              16, a
.w32:
    vpbroadcastw         m0, [tlq-2]
    vpbroadcastw         m1, [tlq-4]
    vpbroadcastw         m2, [tlq-6]
    vpbroadcastw         m3, [tlq-8]
    sub                 tlq, 8
    mova [dstq+strideq*0+32*0], m0
    mova [dstq+strideq*0+32*1], m0
    mova [dstq+strideq*1+32*0], m1
    mova [dstq+strideq*1+32*1], m1
    mova [dstq+strideq*2+32*0], m2
    mova [dstq+strideq*2+32*1], m2
    mova [dstq+stride3q +32*0], m3
    mova [dstq+stride3q +32*1], m3
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w32
    RET
.w64:
    vpbroadcastw         m0, [tlq-2]
    vpbroadcastw         m1, [tlq-4]
    sub                 tlq, 4
    mova [dstq+strideq*0+32*0], m0
    mova [dstq+strideq*0+32*1], m0
    mova [dstq+strideq*0+32*2], m0
    mova [dstq+strideq*0+32*3], m0
    mova [dstq+strideq*1+32*0], m1
    mova [dstq+strideq*1+32*1], m1
    mova [dstq+strideq*1+32*2], m1
    mova [dstq+strideq*1+32*3], m1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w64
    RET

; Paeth predictor for one vector: selects top, left or topleft per
; pixel, whichever is closest to left + top - topleft.
; Inputs: m1 = left, m3 = topleft, m%1 = top, m%2 = top - topleft,
; m%3 = |top - topleft|. Result in m0; m7 is clobbered.
%macro PAETH 3 ; top, signed_ldiff, ldiff
    paddw                m0, m%2, m1
    psubw                m7, m3, m0 ; tldiff
    psubw                m0, m%1    ; tdiff
    pabsw                m7, m7
    pabsw                m0, m0
    pminsw               m7, m0
    pcmpeqw              m0, m7
    pcmpgtw              m7, m%3, m7
    vpblendvb            m0, m3, m%1, m0
    vpblendvb            m0, m1, m0, m7
%endmacro

cglobal ipred_paeth_16bpc, 3, 6, 8, dst, stride, tl, w, h
%define base r5-ipred_paeth_16bpc_avx2_table
    movifnidn            hd, hm
    lea                  r5, [ipred_paeth_16bpc_avx2_table]
    tzcnt                wd, wd
    movsxd               wq, [r5+wq*4]
    vpbroadcastw         m3, [tlq]   ; topleft
    add                  wq, r5
    jmp                  wq
.w4:
    vpbroadcastq         m2, [tlq+2] ; top
    movsldup             m6, [base+ipred_hv_shuf]
    lea                  r3, [strideq*3]
    psubw                m4, m2, m3
    pabsw                m5, m4
.w4_loop:
    sub                 tlq, 8
    vpbroadcastq         m1, [tlq]
    pshufb               m1, m6      ; left
    PAETH                 2, 4, 5
    vextracti128        xm1, m0, 1
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+r3       ], xm1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w4_loop
    RET
ALIGN function_align
.w8:
    vbroadcasti128       m2, [tlq+2]
    movsldup             m6, [base+ipred_hv_shuf]
    psubw                m4, m2, m3
    pabsw                m5, m4
.w8_loop:
    sub                 tlq, 4
    vpbroadcastd         m1, [tlq]
    pshufb               m1, m6
    PAETH                 2, 4, 5
    mova   [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w8_loop
    RET
ALIGN function_align
.w16:
    movu                 m2, [tlq+2]
    psubw                m4, m2, m3
    pabsw                m5, m4
.w16_loop:
    sub                 tlq, 2
    vpbroadcastw         m1, [tlq]
    PAETH                 2, 4, 5
    mova             [dstq], m0
    add                dstq, strideq
    dec                  hd
    jg .w16_loop
    RET
ALIGN function_align
.w32:
    movu                 m2, [tlq+2]
    movu                 m6, [tlq+34]
%if WIN64
    ; xmm8/xmm9 are callee-saved on Win64; spill to shadow space.
    movaps              r4m, xmm8
    movaps              r6m, xmm9
%endif
    psubw                m4, m2, m3
    psubw                m8, m6, m3
    pabsw                m5, m4
    pabsw                m9, m8
.w32_loop:
    sub                 tlq, 2
    vpbroadcastw         m1, [tlq]
    PAETH                 2, 4, 5
    mova        [dstq+32*0], m0
    PAETH                 6, 8, 9
    mova        [dstq+32*1], m0
    add                dstq, strideq
    dec                  hd
    jg .w32_loop
%if WIN64
    movaps             xmm8, r4m
    movaps             xmm9, r6m
%endif
    RET
ALIGN function_align
.w64:
    WIN64_SPILL_XMM      16
    movu                 m2, [tlq+ 2]
    movu                 m6, [tlq+34]
    movu                m10, [tlq+66]
    movu                m13, [tlq+98]
    psubw                m4, m2, m3
    psubw                m8, m6, m3
    psubw               m11, m10, m3
    psubw               m14, m13, m3
    pabsw                m5, m4
    pabsw                m9, m8
    pabsw               m12, m11
    pabsw               m15, m14
.w64_loop:
    sub                 tlq, 2
    vpbroadcastw         m1, [tlq]
    PAETH                 2, 4, 5
    mova        [dstq+32*0], m0
    PAETH                 6, 8, 9
    mova        [dstq+32*1], m0
    PAETH                10, 11, 12
    mova        [dstq+32*2], m0
    PAETH                13, 14, 15
    mova        [dstq+32*3], m0
    add                dstq, strideq
    dec                  hd
    jg .w64_loop
    RET

; Vertical smooth prediction: each row blends the top row with the
; bottom-left pixel using per-row 1-D weights (pmulhrsw form).
cglobal ipred_smooth_v_16bpc, 3, 7, 6, dst, stride, tl, w, h, weights
%define base r6-ipred_smooth_v_16bpc_avx2_table
    lea                  r6, [ipred_smooth_v_16bpc_avx2_table]
    tzcnt                wd, wm
    mov                  hd, hm
    movsxd               wq, [r6+wq*4]
    lea            weightsq, [base+smooth_weights_1d_16bpc+hq*4]
    neg                  hq
    vpbroadcastw         m5, [tlq+hq*2] ; bottom
    add                  wq, r6
    jmp                  wq
.w4:
    vpbroadcastq         m4, [tlq+2] ; top
    movsldup             m3, [base+ipred_hv_shuf]
    lea                  r6, [strideq*3]
    psubw                m4, m5 ; top - bottom
.w4_loop:
    vpbroadcastq         m0, [weightsq+hq*2]
    pshufb               m0, m3
    pmulhrsw             m0, m4
    paddw                m0, m5
    vextracti128        xm1, m0, 1
    movhps [dstq+strideq*0], xm1
    movhps [dstq+strideq*1], xm0
    movq   [dstq+strideq*2], xm1
    movq   [dstq+r6       ], xm0
    lea                dstq, [dstq+strideq*4]
    add                  hq, 4
    jl .w4_loop
.ret:
    RET
.w8:
    vbroadcasti128       m4, [tlq+2]
    movsldup             m3, [base+ipred_hv_shuf]
    lea                  r6, [strideq*3]
    psubw                m4, m5
.w8_loop:
    vpbroadcastd         m0, [weightsq+hq*2+0]
    vpbroadcastd         m1, [weightsq+hq*2+4]
    pshufb               m0, m3
    pshufb               m1, m3
    pmulhrsw             m0, m4
    pmulhrsw             m1, m4
    paddw                m0, m5
    paddw                m1, m5
    vextracti128 [dstq+strideq*0], m0, 1
    mova   [dstq+strideq*1], xm0
    vextracti128 [dstq+strideq*2], m1, 1
    mova   [dstq+r6       ], xm1
    lea                dstq, [dstq+strideq*4]
    add                  hq, 4
    jl .w8_loop
    RET
.w16:
    movu                 m4, [tlq+2]
    lea                  r6, [strideq*3]
    psubw                m4, m5
.w16_loop:
    vpbroadcastw         m0, [weightsq+hq*2+0]
    vpbroadcastw         m1, [weightsq+hq*2+2]
    vpbroadcastw         m2, [weightsq+hq*2+4]
    vpbroadcastw         m3, [weightsq+hq*2+6]
    REPX {pmulhrsw x, m4}, m0, m1, m2, m3
    REPX {paddw    x, m5}, m0, m1, m2, m3
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m1
    mova   [dstq+strideq*2], m2
    mova   [dstq+r6       ], m3
    lea                dstq, [dstq+strideq*4]
    add                  hq, 4
    jl .w16_loop
    RET
.w32:
    WIN64_SPILL_XMM       7
    movu                 m4, [tlq+ 2]
    movu                 m6, [tlq+34]
    psubw                m4, m5
    psubw                m6, m5
.w32_loop:
    vpbroadcastw         m1, [weightsq+hq*2+0]
    vpbroadcastw         m3, [weightsq+hq*2+2]
    pmulhrsw             m0, m4, m1
    pmulhrsw             m1, m6
    pmulhrsw             m2, m4, m3
    pmulhrsw             m3, m6
    REPX    {paddw x, m5}, m0, m1, m2, m3
    mova [dstq+strideq*0+32*0], m0
    mova [dstq+strideq*0+32*1], m1
    mova [dstq+strideq*1+32*0], m2
    mova [dstq+strideq*1+32*1], m3
    lea                dstq, [dstq+strideq*2]
    add                  hq, 2
    jl .w32_loop
    RET
.w64:
    WIN64_SPILL_XMM       8
    movu                 m3, [tlq+ 2]
    movu                 m4, [tlq+34]
    movu                 m6, [tlq+66]
    movu                 m7, [tlq+98]
    REPX    {psubw x, m5}, m3, m4, m6, m7
.w64_loop:
    vpbroadcastw         m2, [weightsq+hq*2]
    pmulhrsw             m0, m3, m2
    pmulhrsw             m1, m4, m2
    paddw                m0, m5
    paddw                m1, m5
    mova        [dstq+32*0], m0
    pmulhrsw             m0, m6, m2
    mova        [dstq+32*1], m1
    pmulhrsw             m1, m7, m2
    paddw                m0, m5
    paddw                m1, m5
    mova        [dstq+32*2], m0
    mova        [dstq+32*3], m1
    add                dstq, strideq
    inc                  hq
    jl .w64_loop
    RET

; Horizontal smooth prediction: each column blends the left pixel with
; the top-right pixel using per-column 1-D weights. h is doubled up
; front so hq indexes 16-bit pixels directly.
cglobal ipred_smooth_h_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
%define base r6-ipred_smooth_h_16bpc_avx2_table
    lea                  r6, [ipred_smooth_h_16bpc_avx2_table]
    mov                  wd, wm
    movifnidn            hd, hm
    vpbroadcastw         m5, [tlq+wq*2] ; right
    tzcnt                wd, wd
    add                  hd, hd
    movsxd               wq, [r6+wq*4]
    sub                 tlq, hq
    lea            stride3q, [strideq*3]
    add                  wq, r6
    jmp                  wq
.w4:
    vpbroadcastq         m4, [base+smooth_weights_1d_16bpc+4*2]
    movsldup             m3, [base+ipred_hv_shuf]
.w4_loop:
    vpbroadcastq         m0, [tlq+hq-8] ; left
    pshufb               m0, m3
    psubw                m0, m5 ; left - right
    pmulhrsw             m0, m4
    paddw                m0, m5
    vextracti128        xm1, m0, 1
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4*2
    jg .w4_loop
    RET
.w8:
    vbroadcasti128       m4, [base+smooth_weights_1d_16bpc+8*2]
    movsldup             m3, [base+ipred_hv_shuf]
.w8_loop:
    vpbroadcastd         m0, [tlq+hq-4]
    vpbroadcastd         m1, [tlq+hq-8]
    pshufb               m0, m3
    pshufb               m1, m3
    psubw                m0, m5
    psubw                m1, m5
    pmulhrsw             m0, m4
    pmulhrsw             m1, m4
    paddw                m0, m5
    paddw                m1, m5
    mova   [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    mova   [dstq+strideq*2], xm1
    vextracti128 [dstq+stride3q ], m1, 1
    lea                dstq, [dstq+strideq*4]
    sub                  hq, 4*2
    jg .w8_loop
    RET
.w16:
    movu                 m4, [base+smooth_weights_1d_16bpc+16*2]
.w16_loop:
    vpbroadcastq         m3, [tlq+hq-8]
    punpcklwd            m3, m3
    psubw                m3, m5
    pshufd               m0, m3, q3333
    pshufd               m1, m3, q2222
    pshufd               m2, m3, q1111
    pshufd               m3, m3, q0000
    REPX {pmulhrsw x, m4}, m0, m1, m2, m3
    REPX {paddw    x, m5}, m0, m1, m2, m3
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m1
    mova   [dstq+strideq*2], m2
    mova   [dstq+stride3q ], m3
    lea                dstq, [dstq+strideq*4]
    sub                  hq, 4*2
    jg .w16_loop
    RET
.w32:
    WIN64_SPILL_XMM       7
    movu                 m4, [base+smooth_weights_1d_16bpc+32*2]
    movu                 m6, [base+smooth_weights_1d_16bpc+32*3]
.w32_loop:
    vpbroadcastw         m1, [tlq+hq-2]
    vpbroadcastw         m3, [tlq+hq-4]
    psubw                m1, m5
    psubw                m3, m5
    pmulhrsw             m0, m4, m1
    pmulhrsw             m1, m6
    pmulhrsw             m2, m4, m3
    pmulhrsw             m3, m6
    REPX    {paddw x, m5}, m0, m1, m2, m3
    mova [dstq+strideq*0+32*0], m0
    mova [dstq+strideq*0+32*1], m1
    mova [dstq+strideq*1+32*0], m2
    mova [dstq+strideq*1+32*1], m3
    lea                dstq, [dstq+strideq*2]
    sub                  hq, 2*2
    jg .w32_loop
    RET
.w64:
    WIN64_SPILL_XMM       8
    movu                 m3, [base+smooth_weights_1d_16bpc+32*4]
    movu                 m4, [base+smooth_weights_1d_16bpc+32*5]
    movu                 m6, [base+smooth_weights_1d_16bpc+32*6]
    movu                 m7, [base+smooth_weights_1d_16bpc+32*7]
.w64_loop:
    vpbroadcastw         m2, [tlq+hq-2]
    psubw                m2, m5
    pmulhrsw             m0, m3, m2
    pmulhrsw             m1, m4, m2
    paddw                m0, m5
    paddw                m1, m5
    mova        [dstq+32*0], m0
    pmulhrsw             m0, m6, m2
    mova        [dstq+32*1], m1
    pmulhrsw             m1, m7, m2
    paddw                m0, m5
    paddw                m1, m5
    mova        [dstq+32*2], m0
    mova        [dstq+32*3], m1
    add                dstq, strideq
    sub                  hq, 1*2
    jg .w64_loop
    RET

; Final blend for the 2-D smooth predictor:
; m0 = avg((m%1*m%3 + m%5) >> 8 packed with (m%2*m%4 + m%6) >> 8, m5).
; m5 is zero in ipred_smooth, so pavgw performs the final +1 rounding.
; Clobbers m1.
%macro SMOOTH_2D_END 6 ; src[1-2], mul[1-2], add[1-2]
    pmaddwd              m0, m%1, m%3
    pmaddwd              m1, m%2, m%4
    paddd                m0, m%5
    paddd                m1, m%6
    psrld                m0, 8
    psrld                m1, 8
    packssdw             m0, m1
    pavgw                m0, m5
%endmacro

; 2-D smooth prediction: per-pixel bilinear blend of (top, bottom) and
; (left, right), using the interleaved (w, 256-w) 2-D weight tables
; via pmaddwd.
cglobal ipred_smooth_16bpc, 3, 7, 6, dst, stride, tl, w, h, v_weights
%define base r6-ipred_smooth_16bpc_avx2_table
    lea                  r6, [ipred_smooth_16bpc_avx2_table]
    mov                  wd, wm
    vpbroadcastw         m4, [tlq+wq*2] ; right
    tzcnt                wd, wd
    mov                  hd, hm
    sub                 tlq, hq
    sub                 tlq, hq
    movsxd               wq, [r6+wq*4]
    pxor                 m5, m5
    add                  wq, r6
    lea          v_weightsq, [base+smooth_weights_2d_16bpc+hq*4]
    jmp                  wq
.w4:
    WIN64_SPILL_XMM      11
    vpbroadcastw         m0, [tlq] ; bottom
    vpbroadcastq         m6, [tlq+hq*2+2]
    movsldup             m7, [base+ipred_hv_shuf]
    movshdup             m9, [base+ipred_hv_shuf]
    vbroadcasti128      m10, [base+smooth_weights_2d_16bpc+4*4]
    punpcklwd            m6, m0 ; top, bottom
    punpcklqdq           m8, m9, m9
    punpckhqdq           m9, m9
    lea                  r3, [strideq*3]
.w4_loop:
    vpbroadcastq         m3, [tlq+hq*2-8]
    vbroadcasti128       m1, [v_weightsq]
    pshufb               m3, m7
    punpcklwd            m2, m3, m4 ; left, right
    punpckhwd            m3, m4
    pmaddwd              m2, m10
    pmaddwd              m3, m10
    pshufb               m0, m1, m8
    pshufb               m1, m9
    SMOOTH_2D_END         0, 1, 6, 6, 2, 3
    vextracti128        xm1, m0, 1
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+r3       ], xm1
    lea                dstq, [dstq+strideq*4]
    add          v_weightsq, 16
    sub                  hd, 4
    jg .w4_loop
    RET
.w8:
    WIN64_SPILL_XMM      12
    vpbroadcastw         m0, [tlq] ; bottom
    vbroadcasti128       m7, [tlq+hq*2+2]
    movsldup             m8, [base+ipred_hv_shuf]
    movshdup             m9, [base+ipred_hv_shuf]
    vbroadcasti128      m10, [base+smooth_weights_2d_16bpc+8*4+16*0]
    vbroadcasti128      m11, [base+smooth_weights_2d_16bpc+8*4+16*1]
    punpcklwd            m6, m7, m0 ; top, bottom
    punpckhwd            m7, m0
.w8_loop:
    vpbroadcastd         m3, [tlq+hq*2-4]
    vpbroadcastq         m1, [v_weightsq]
    pshufb               m3, m8
    punpcklwd            m2, m3, m4 ; left, right
    punpckhwd            m3, m4
    pmaddwd              m2, m10
    pmaddwd              m3, m11
    pshufb               m1, m9
    SMOOTH_2D_END         1, 1, 6, 7, 2, 3
    mova   [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    lea                dstq, [dstq+strideq*2]
    add          v_weightsq, 8
    sub                  hd, 2
    jg .w8_loop
    RET
.w16:
    WIN64_SPILL_XMM      11
    vpbroadcastw         m0, [tlq] ; bottom
    movu                 m7, [tlq+hq*2+2]
    mova                xm8, [base+smooth_weights_2d_16bpc+16*4+16*0]
    mova                xm9, [base+smooth_weights_2d_16bpc+16*4+16*1]
    vinserti128          m8, [base+smooth_weights_2d_16bpc+16*4+16*2], 1
    vinserti128          m9, [base+smooth_weights_2d_16bpc+16*4+16*3], 1
    punpcklwd            m6, m7, m0 ; top, bottom
    punpckhwd            m7, m0
.w16_loop:
    vpbroadcastd         m3, [tlq+hq*2-4]
    vpbroadcastd         m1, [v_weightsq+0]
    punpcklwd            m3, m4 ; left, right
    pshufd               m2, m3, q1111
    pmaddwd             m10, m8, m2
    pmaddwd              m2, m9
    pshufd               m3, m3, q0000
    SMOOTH_2D_END         1, 1, 6, 7, 10, 2
    vpbroadcastd         m1, [v_weightsq+4]
    pmaddwd              m2, m8, m3
    pmaddwd              m3, m9
    mova   [dstq+strideq*0], m0
    SMOOTH_2D_END         1, 1, 6, 7, 2, 3
    mova   [dstq+strideq*1], m0
    lea                dstq, [dstq+strideq*2]
    add          v_weightsq, 8
    sub                  hq, 2
    jg .w16_loop
    RET
.w32:
    WIN64_SPILL_XMM      15
    vpbroadcastw         m0, [tlq] ; bottom
    movu                 m7, [tlq+hq*2+ 2]
    movu                 m9, [tlq+hq*2+34]
    mova               xm10, [base+smooth_weights_2d_16bpc+32*4+16*0]
    mova               xm11, [base+smooth_weights_2d_16bpc+32*4+16*1]
    vinserti128         m10, [base+smooth_weights_2d_16bpc+32*4+16*2], 1
    vinserti128         m11, [base+smooth_weights_2d_16bpc+32*4+16*3], 1
    mova               xm12, [base+smooth_weights_2d_16bpc+32*4+16*4]
    mova               xm13, [base+smooth_weights_2d_16bpc+32*4+16*5]
    vinserti128         m12, [base+smooth_weights_2d_16bpc+32*4+16*6], 1
    vinserti128         m13, [base+smooth_weights_2d_16bpc+32*4+16*7], 1
    punpcklwd            m6, m7, m0
    punpckhwd            m7, m0
    punpcklwd            m8, m9, m0
    punpckhwd            m9, m0
.w32_loop:
    vpbroadcastw         m3, [tlq+hq*2-2]
    vpbroadcastd        m14, [v_weightsq]
    punpcklwd            m3, m4
    pmaddwd              m1, m10, m3
    pmaddwd              m2, m11, m3
    pmaddwd              m0, m6, m14
    paddd                m0, m1
    pmaddwd              m1, m7, m14
    paddd                m1, m2
    pmaddwd              m2, m12, m3
    pmaddwd              m3, m13
    psrld                m0, 8
    psrld                m1, 8
    packssdw             m0, m1
    pavgw                m0, m5
    mova        [dstq+32*0], m0
    SMOOTH_2D_END        14, 14, 8, 9, 2, 3
    mova        [dstq+32*1], m0
    add                dstq, strideq
    add          v_weightsq, 4
    dec                  hd
    jg .w32_loop
    RET
.w64:
    ; Processed as two 32-wide column passes (x = 0 then 32); the
    ; `add r6, 16*8` rebases `base` so the weight loads above pick up
    ; the second half of the 64-wide weight row on the second pass.
    PROLOGUE 0, 11, 16, dst, stride, tl, tl_base, h, v_weights, dummy, v_weights_base, x, y, dst_base
    mov           dst_baseq, dstq
    mov            tl_baseq, tlq
    mov     v_weights_baseq, v_weightsq
    xor                  xq, xq
.w64_loop_x:
    mov                  yq, hq
    lea                 tlq, [tl_baseq+hq*2]
    vpbroadcastw         m0, [tl_baseq] ; bottom
    movu                 m7, [tlq+xq*2+ 2]
    movu                 m9, [tlq+xq*2+34]
    mova               xm10, [base+smooth_weights_2d_16bpc+64*4+16*0]
    mova               xm11, [base+smooth_weights_2d_16bpc+64*4+16*1]
    vinserti128         m10, [base+smooth_weights_2d_16bpc+64*4+16*2], 1
    vinserti128         m11, [base+smooth_weights_2d_16bpc+64*4+16*3], 1
    mova               xm12, [base+smooth_weights_2d_16bpc+64*4+16*4]
    mova               xm13, [base+smooth_weights_2d_16bpc+64*4+16*5]
    vinserti128         m12, [base+smooth_weights_2d_16bpc+64*4+16*6], 1
    vinserti128         m13, [base+smooth_weights_2d_16bpc+64*4+16*7], 1
    punpcklwd            m6, m7, m0
    punpckhwd            m7, m0
    punpcklwd            m8, m9, m0
    punpckhwd            m9, m0
    lea                 tlq, [tl_baseq-2]
.w64_loop_y:
    vpbroadcastw         m3, [tlq+yq*2]
    vpbroadcastd         m1, [v_weightsq]
    punpcklwd            m3, m4
    pmaddwd             m14, m10, m3
    pmaddwd             m15, m11, m3
    pmaddwd              m2, m12, m3
    pmaddwd              m3, m13
    pmaddwd              m0, m6, m1
    paddd                m0, m14
    pmaddwd             m14, m7, m1
    paddd               m14, m15
    psrld                m0, 8
    psrld               m14, 8
    packssdw             m0, m14
    pavgw                m0, m5
    mova        [dstq+32*0], m0
    SMOOTH_2D_END         8, 9, 1, 1, 2, 3
    mova        [dstq+32*1], m0
    add                dstq, strideq
    add          v_weightsq, 4
    dec                  yq
    jg .w64_loop_y
    lea                dstq, [dst_baseq+32*2]
    add                  r6, 16*8
    mov          v_weightsq, v_weights_baseq
    add                  xq, 32
    test                 xb, 64
    jz .w64_loop_x
    RET

; Directional prediction, zone 1 (angle < 90): pixels come from the
; top edge only, sampled along dx-sized steps with 6-bit fractional
; interpolation; the edge may be filtered or 2x upsampled first.
cglobal ipred_z1_16bpc, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase
    lea                  r6, [ipred_z1_16bpc_avx2_table]
    tzcnt                wd, wm
    movifnidn        angled, anglem
    movifnidn            hd, hm
    lea                  r7, [dr_intra_derivative]
    movsxd               wq, [r6+wq*4]
    add                 tlq, 2
    add                  wq, r6
    mov                 dxd, angled
    and                 dxd, 0x7e
    add              angled, 165 ; ~90
    movzx               dxd, word [r7+dxq]
    xor              angled, 0x4ff ; d = 90 - angle
    vpbroadcastd         m5, [pw_62]
    jmp                  wq
.w4:
    ALLOC_STACK         -64, 7
    cmp              angleb, 40
    jae .w4_no_upsample
    lea                 r3d, [angleq-1024]
    sar                 r3d, 7
    add                 r3d, hd
    jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm)
    vpbroadcastw        xm3, [tlq+14]
    movu                xm1, [tlq+ 0]    ; 1 2 3 4 5 6 7 8
    palignr             xm0, xm3, xm1, 4 ; 3 4 5 6 7 8 8 8
    paddw               xm0, [tlq- 2]    ; 0 1 2 3 4 5 6 7
    add                 dxd, dxd
    palignr             xm2, xm3, xm1, 2 ; 2 3 4 5 6 7 8 8
    paddw               xm2, xm1         ; -1 * a + 9 * b + 9 * c + -1 * d
    psubw               xm0, xm2, xm0    ; = (b + c - a - d + (b + c) << 3 + 8) >> 4
    psraw               xm0, 3           ; = ((b + c - a - d) >> 3 + b + c + 1) >> 1
    pxor                xm4, xm4
    paddw               xm2, xm0
    vpbroadcastw        xm0, r8m ; pixel_max
    mova           [rsp+32], xm3
    movd                xm3, dxd
    pmaxsw              xm2, xm4         ; clamp interpolated samples
    mov                 r3d, dxd
    pavgw               xm2, xm4
    vpbroadcastw         m3, xm3
    pminsw              xm2, xm0
    punpcklwd           xm0, xm1, xm2    ; interleave original/upsampled
    punpckhwd           xm1, xm2
    lea                  r5, [strideq*3]
    pslldq               m2, m3, 8
    mova           [rsp+ 0], xm0
    mova           [rsp+16], xm1
    paddw                m6, m3, m3
    paddw                m3, m2
    vpblendd             m4, m6, 0xf0
    paddw                m6, m6
    paddw                m3, m4 ; xpos0 xpos1 xpos2 xpos3
    vbroadcasti128       m4, [z_upsample]
.w4_upsample_loop:
    lea                 r2d, [r3+dxq]
    shr                 r3d, 6 ; base0
    movu                xm1, [rsp+r3*2]
    lea                 r3d, [r2+dxq]
    shr                 r2d, 6 ; base1
    movu                xm2, [rsp+r2*2]
    lea                 r2d, [r3+dxq]
    shr                 r3d, 6 ; base2
    vinserti128          m1, [rsp+r3*2], 1 ; 0 2
    lea                 r3d, [r2+dxq]
    shr                 r2d, 6 ; base3
    vinserti128          m2, [rsp+r2*2], 1 ; 1 3
    pshufb               m1, m4
    pshufb               m2, m4
    punpcklqdq           m0, m1, m2
    punpckhqdq           m1, m2
    pand                 m2, m5, m3 ; frac
    psllw                m2, 9      ; (a * (64 - frac) + b * frac + 32) >> 6
    psubw                m1, m0     ; = a + (((b - a) * frac + 32) >> 6)
    pmulhrsw             m1, m2     ; = a + (((b - a) * (frac << 9) + 16384) >> 15)
    paddw                m3, m6     ; xpos += dx
    paddw                m0, m1
    vextracti128        xm1, m0, 1
    movq   [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    movq   [dstq+strideq*2], xm1
    movhps [dstq+r5       ], xm1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w4_upsample_loop
    RET
ALIGN function_align
.filter_strength: ; w4/w8/w16
    ; Returns a filter-strength bitmask in r5d derived from block size
    ; (maxbased), angle delta and the is_sm flag packed in angled.
%define base r3-z_filter_t0
    movd                xm0, maxbased
    lea                  r3, [z_filter_t0]
    movd                xm1, angled
    shr              angled, 8 ; is_sm << 1
    vpbroadcastb         m0, xm0
    vpbroadcastb         m1, xm1
    pcmpeqb              m0, [base+z_filter_wh]
    mova                xm2, [r3+angleq*8]
    pand                 m0, m1
    pcmpgtb              m0, m2
    pmovmskb            r5d, m0
    ret
.w4_no_upsample:
    mov            maxbased, 7
    test             angled, 0x400 ; !enable_intra_edge_filter
    jnz .w4_main
    lea            maxbased, [hq+3]
    call .filter_strength
    mov            maxbased, 7
    test                r5d, r5d
    jz .w4_main ; filter_strength == 0
    popcnt              r5d, r5d
    vpbroadcastw        xm3, [tlq+14]
    mova                xm0, [tlq- 2]          ; 0 1 2 3 4 5 6 7
    vpbroadcastd        xm1, [base+z_filter_k-4+r5*4+12*1]
    vpbroadcastd        xm4, [base+z_filter_k-4+r5*4+12*0]
    palignr             xm2, xm3, xm0, 4       ; 2 3 4 5 6 7 8 8
    pmullw              xm1, [tlq+ 0]          ; 1 2 3 4 5 6 7 8
    paddw               xm2, xm0
    pmullw              xm2, xm4
    movd           [rsp+16], xm3
    cmp                 r5d, 3
    jne .w4_3tap
    paddw               xm1, xm2
    palignr             xm2, xm3, xm0, 6       ; 3 4 5 6 7 8 8 8
    pblendw             xm0, [tlq-4], 0xfe     ; 0 0 1 2 3 4 5 6
    movzx               r3d, word [tlq+14]
    movzx               r2d, word [tlq+12]
    inc            maxbased
    paddw               xm2, xm0
    sub                 r2d, r3d
    paddw               xm2, xm2
    lea                 r2d, [r2+r3*8+4]
    shr                 r2d, 3 ; (1 * top[6] + 7 * top[7] + 4) >> 3
    mov            [rsp+16], r2w
.w4_3tap:
    pxor                xm0, xm0
    paddw               xm1, xm2
    mov                 tlq, rsp               ; use the filtered edge copy
    psrlw               xm1, 3
    cmp                  hd, 8
    sbb            maxbased, -1
    pavgw               xm0, xm1
    mova              [tlq], xm0
.w4_main:
    movd                xm3, dxd
    vpbroadcastq         m1, [z_base_inc]
    vpbroadcastw         m6, [tlq+maxbaseq*2] ; top[max_base_x]
    shl            maxbased, 6
    vpbroadcastw         m3, xm3
    movd                xm0, maxbased
    mov                 r3d, dxd ; xpos
    vpbroadcastw         m0, xm0
    paddw                m4, m3, m3
    psubw                m1, m0 ; -max_base_x
    vpblendd             m3, m4, 0xcc
    paddw                m0, m4, m3
    vpblendd             m3, m0, 0xf0 ; xpos0 xpos1 xpos2 xpos3
    paddw                m4, m4
    paddw                m3, m1
.w4_loop:
    lea                 r5d, [r3+dxq]
    shr                 r3d, 6 ; base0
    movu                xm1, [tlq+r3*2]
    lea                 r3d, [r5+dxq]
    shr                 r5d, 6 ; base1
    movu                xm2, [tlq+r5*2]
    lea                 r5d, [r3+dxq]
    shr                 r3d, 6 ; base2
    vinserti128          m1, [tlq+r3*2], 1 ; 0 2
    lea                 r3d, [r5+dxq]
    shr                 r5d, 6 ; base3
    vinserti128          m2, [tlq+r5*2], 1 ; 1 3
    punpcklqdq           m0, m1, m2
    psrldq               m1, 2
    pslldq               m2, 6
    vpblendd             m1, m2, 0xcc
    pand                 m2, m5, m3
    psllw                m2, 9
    psubw                m1, m0
    pmulhrsw             m1, m2
    psraw                m2, m3, 15 ; xpos < max_base_x
    paddw                m3, m4
    paddw                m0, m1
    vpblendvb            m0, m6, m0, m2
    vextracti128        xm1, m0, 1
    movq   [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    lea                dstq, [dstq+strideq*2]
    movq   [dstq+strideq*0], xm1
    movhps [dstq+strideq*1], xm1
    sub                  hd, 4
    jz .w4_end
    lea                dstq, [dstq+strideq*2]
    cmp                 r3d, maxbased
    jb .w4_loop
    lea                  r6, [strideq*3]
.w4_end_loop:
    ; Past max_base_x: all remaining rows are the last edge pixel.
    movq   [dstq+strideq*0], xm6
    movq   [dstq+strideq*1], xm6
    movq   [dstq+strideq*2], xm6
    movq   [dstq+r6       ], xm6
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w4_end_loop
.w4_end:
    RET
.w8:
    ALLOC_STACK         -64, 7
    lea                 r3d, [angleq+216]
    mov                 r3b, hb
    cmp                 r3d, 8
    ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
    movu                 m2, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g _
    movu                 m0, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g _ _
    movu                 m1, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
    cmp                  hd, 4
    jne .w8_upsample_h8 ; awkward single-pixel edge case
    vpblendd             m0, m2, 0x20 ; 3 4 5 6 7 8 9 a b c c _ _ _ _ _
.w8_upsample_h8:
    paddw                m2, m1
    paddw                m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
    add                 dxd, dxd
    psubw                m0, m2, m0
    psraw                m0, 3
    pxor                 m4, m4
    paddw                m2, m0
    vpbroadcastw         m0, r8m
    movd                xm3, dxd
    pmaxsw
m2, m4 1332 mov r3d, dxd 1333 pavgw m2, m4 1334 vpbroadcastw m3, xm3 1335 pminsw m2, m0 1336 punpcklwd m0, m1, m2 1337 punpckhwd m1, m2 1338 vbroadcasti128 m4, [z_upsample] 1339 mova [rsp+ 0], xm0 1340 mova [rsp+16], xm1 1341 paddw m6, m3, m3 1342 vextracti128 [rsp+32], m0, 1 1343 vextracti128 [rsp+48], m1, 1 1344 vpblendd m3, m6, 0xf0 ; xpos0 xpos1 1345.w8_upsample_loop: 1346 lea r2d, [r3+dxq] 1347 shr r3d, 6 ; base0 1348 movu xm1, [rsp+r3*2] 1349 movu xm2, [rsp+r3*2+16] 1350 lea r3d, [r2+dxq] 1351 shr r2d, 6 ; base1 1352 vinserti128 m1, [rsp+r2*2], 1 1353 vinserti128 m2, [rsp+r2*2+16], 1 1354 pshufb m1, m4 1355 pshufb m2, m4 1356 punpcklqdq m0, m1, m2 1357 punpckhqdq m1, m2 1358 pand m2, m5, m3 1359 psllw m2, 9 1360 psubw m1, m0 1361 pmulhrsw m1, m2 1362 paddw m3, m6 1363 paddw m0, m1 1364 mova [dstq+strideq*0], xm0 1365 vextracti128 [dstq+strideq*1], m0, 1 1366 lea dstq, [dstq+strideq*2] 1367 sub hd, 2 1368 jg .w8_upsample_loop 1369 RET 1370.w8_no_intra_edge_filter: 1371 and maxbased, 7 1372 or maxbased, 8 ; imin(h+7, 15) 1373 jmp .w8_main 1374.w8_no_upsample: 1375 lea maxbased, [hq+7] 1376 test angled, 0x400 1377 jnz .w8_no_intra_edge_filter 1378 call .filter_strength 1379 test r5d, r5d 1380 jz .w8_main 1381 popcnt r5d, r5d 1382 vpbroadcastd m1, [base+z_filter_k-4+r5*4+12*1] 1383 vpbroadcastd m4, [base+z_filter_k-4+r5*4+12*0] 1384 mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 1385 movu m2, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 1386 pmullw m1, m2 1387 cmp hd, 8 1388 jl .w8_filter_h4 1389 punpckhwd m2, m2 1390 vpblendd m3, m2, [tlq+2], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g 1391 je .w8_filter_end ; 8x4 and 8x8 are always 3-tap 1392 movzx r3d, word [tlq+30] 1393 mov maxbased, 16 1394 mov [rsp+32], r3d 1395 cmp r5d, 3 1396 jne .w8_filter_end 1397 punpcklwd xm6, xm0, xm0 1398 vpblendd m2, [tlq+4], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g g g 1399 vpblendd m6, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e 1400 movzx r5d, word [tlq+28] 1401 mov [rsp+34], r3w 1402 
paddw m2, m6 1403 sub r5d, r3d 1404 inc maxbased 1405 paddw m2, m2 1406 lea r3d, [r5+r3*8+4] 1407 paddw m1, m2 1408 shr r3d, 3 1409 mov [rsp+32], r3w 1410 jmp .w8_filter_end 1411.w8_filter_h4: 1412 pshuflw m3, m2, q3321 1413 vinserti128 m3, [tlq+2], 0 ; 2 3 4 5 6 7 8 9 a b c c _ _ _ _ 1414.w8_filter_end: 1415 paddw m0, m3 1416 pmullw m0, m4 1417 mov tlq, rsp 1418 pxor m2, m2 1419 paddw m0, m1 1420 psrlw m0, 3 1421 pavgw m0, m2 1422 mova [tlq], m0 1423.w8_main: 1424 movd xm3, dxd 1425 vbroadcasti128 m1, [z_base_inc] 1426 vpbroadcastw m6, [tlq+maxbaseq*2] 1427 shl maxbased, 6 1428 vpbroadcastw m3, xm3 1429 movd xm0, maxbased 1430 mov r3d, dxd 1431 vpbroadcastw m0, xm0 1432 paddw m4, m3, m3 1433 psubw m1, m0 1434 vpblendd m3, m4, 0xf0 ; xpos0 xpos1 1435 paddw m3, m1 1436.w8_loop: 1437 lea r5d, [r3+dxq] 1438 shr r3d, 6 1439 movu xm0, [tlq+r3*2] 1440 movu xm1, [tlq+r3*2+2] 1441 lea r3d, [r5+dxq] 1442 shr r5d, 6 1443 vinserti128 m0, [tlq+r5*2], 1 1444 vinserti128 m1, [tlq+r5*2+2], 1 1445 pand m2, m5, m3 1446 psllw m2, 9 1447 psubw m1, m0 1448 pmulhrsw m1, m2 1449 psraw m2, m3, 15 1450 paddw m3, m4 1451 paddw m0, m1 1452 vpblendvb m0, m6, m0, m2 1453 mova [dstq+strideq*0], xm0 1454 vextracti128 [dstq+strideq*1], m0, 1 1455 sub hd, 2 1456 jz .w8_end 1457 lea dstq, [dstq+strideq*2] 1458 cmp r3d, maxbased 1459 jb .w8_loop 1460.w8_end_loop: 1461 mova [dstq+strideq*0], xm6 1462 mova [dstq+strideq*1], xm6 1463 lea dstq, [dstq+strideq*2] 1464 sub hd, 2 1465 jg .w8_end_loop 1466.w8_end: 1467 RET 1468.w16_no_intra_edge_filter: 1469 and maxbased, 15 1470 or maxbased, 16 ; imin(h+15, 31) 1471 jmp .w16_main 1472.w16: 1473 ALLOC_STACK -96, 7 1474 lea maxbased, [hq+15] 1475 test angled, 0x400 1476 jnz .w16_no_intra_edge_filter 1477 call .filter_strength 1478 test r5d, r5d 1479 jz .w16_main 1480 popcnt r5d, r5d 1481 mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 1482 paddw m1, m0, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g h 1483 cmp r5d, 3 1484 jne .w16_filter_3tap 1485 vpbroadcastd 
m2, [base+pw_3] 1486 punpcklwd xm0, xm0 1487 vpblendd m0, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e 1488 paddw m1, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 1489 paddw m0, m2 1490 pavgw m0, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g h i 1491 paddw m0, m1 1492 psrlw m0, 2 1493 movu m3, [tlq+32] ; 2 3 4 5 6 7 8 9 a b c d e f g h 1494 paddw m2, [tlq+28] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 1495 paddw m1, m3, [tlq+30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 1496 cmp hd, 8 1497 jl .w16_filter_5tap_h4 1498 punpckhwd m3, m3 1499 je .w16_filter_5tap_h8 1500 vpblendd m4, m3, [tlq+36], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h h 1501 vpblendd m3, [tlq+34], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h 1502 movzx r3d, word [tlq+62] 1503 movzx r2d, word [tlq+60] 1504 pavgw m2, m4 1505 sub r2d, r3d 1506 paddw m1, m3 1507 lea r2d, [r2+r3*8+4] 1508 paddw m1, m2 1509 shr r2d, 3 1510 psrlw m1, 2 1511 mov [rsp+66], r3w 1512 mov [rsp+64], r2w 1513 mov tlq, rsp 1514 mov r3d, 33 1515 cmp hd, 16 1516 cmovg maxbased, r3d 1517 jmp .w16_filter_end2 1518.w16_filter_5tap_h8: 1519 vpblendd xm4, xm3, [tlq+36], 0x07 ; 4 5 6 7 8 9 9 9 1520 vpblendd xm3, [tlq+34], 0x07 ; 3 4 5 6 7 8 9 9 1521 pavgw xm2, xm4 1522 paddw xm1, xm3 1523 paddw xm1, xm2 1524 psrlw xm1, 2 1525 jmp .w16_filter_end2 1526.w16_filter_5tap_h4: 1527 pshuflw xm4, xm3, q3332 ; 4 5 5 5 1528 pshuflw xm3, xm3, q3321 ; 3 4 5 5 1529 pavgw xm2, xm4 1530 paddw xm1, xm3 1531 paddw xm1, xm2 1532 psrlw xm1, 2 1533 jmp .w16_filter_end2 1534.w16_filter_3tap: 1535 vpbroadcastd m3, [base+z_filter_k-4+r5*4+12*1] 1536 vpbroadcastd m4, [base+z_filter_k-4+r5*4+12*0] 1537 pmullw m0, m3, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 1538 movu m2, [tlq+32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 1539 pmullw m1, m4 1540 pmullw m3, m2 1541 paddw m0, m1 1542 cmp hd, 8 1543 je .w16_filter_3tap_h8 1544 jl .w16_filter_3tap_h4 1545 punpckhwd m2, m2 1546 vpblendd m2, [tlq+34], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g 1547 jmp .w16_filter_end 1548.w16_filter_3tap_h4: 1549 pshuflw xm2, xm2, 
q3321 ; 2 3 4 4 _ _ _ _ 1550 jmp .w16_filter_end 1551.w16_filter_3tap_h8: 1552 psrldq xm2, 2 1553 pshufhw xm2, xm2, q2210 ; 2 3 4 5 6 7 8 8 1554.w16_filter_end: 1555 paddw m2, [tlq+30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 1556 pmullw m2, m4 1557 psrlw m0, 3 1558 pxor m1, m1 1559 paddw m2, m3 1560 psrlw m2, 3 1561 pavgw m0, m1 1562 pavgw m1, m2 1563.w16_filter_end2: 1564 mov tlq, rsp 1565 mova [tlq+ 0], m0 1566 mova [tlq+32], m1 1567.w16_main: 1568 movd xm4, dxd 1569 vpbroadcastw m6, [tlq+maxbaseq*2] 1570 shl maxbased, 6 1571 vpbroadcastw m4, xm4 1572 movd xm0, maxbased 1573 mov r3d, dxd 1574 vpbroadcastw m0, xm0 1575 paddw m3, m4, [z_base_inc] 1576 psubw m3, m0 1577.w16_loop: 1578 lea r5d, [r3+dxq] 1579 shr r3d, 6 1580 movu m0, [tlq+r3*2] 1581 movu m1, [tlq+r3*2+2] 1582 lea r3d, [r5+dxq] 1583 shr r5d, 6 1584 pand m2, m5, m3 1585 psllw m2, 9 1586 psubw m1, m0 1587 pmulhrsw m1, m2 1588 psraw m2, m3, 15 1589 paddw m3, m4 1590 paddw m1, m0 1591 movu m0, [tlq+r5*2] 1592 vpblendvb m2, m6, m1, m2 1593 movu m1, [tlq+r5*2+2] 1594 mova [dstq+strideq*0], m2 1595 pand m2, m5, m3 1596 psllw m2, 9 1597 psubw m1, m0 1598 pmulhrsw m1, m2 1599 psraw m2, m3, 15 1600 paddw m3, m4 1601 paddw m0, m1 1602 vpblendvb m0, m6, m0, m2 1603 mova [dstq+strideq*1], m0 1604 sub hd, 2 1605 jz .w16_end 1606 lea dstq, [dstq+strideq*2] 1607 cmp r3d, maxbased 1608 jb .w16_loop 1609.w16_end_loop: 1610 mova [dstq+strideq*0], m6 1611 mova [dstq+strideq*1], m6 1612 lea dstq, [dstq+strideq*2] 1613 sub hd, 2 1614 jg .w16_end_loop 1615.w16_end: 1616 RET 1617.w32: 1618 ALLOC_STACK -160, 8 1619 lea maxbased, [hq+31] 1620 mov r3d, 63 1621 cmp hd, 32 1622 cmova maxbased, r3d 1623 test angled, 0x400 1624 jnz .w32_main 1625 vpbroadcastd m2, [pw_3] 1626 mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 1627 punpcklwd xm1, xm0, xm0 1628 vpblendd m1, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e 1629 paddw m0, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 1630 paddw m1, m2 1631 paddw m0, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c 
d e f g h 1632 pavgw m1, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g h i 1633 mov r3, rsp 1634 paddw m0, m1 1635 lea r5d, [maxbaseq-31] 1636 psrlw m0, 2 1637 mova [r3], m0 1638.w32_filter_loop: 1639 mova m0, [tlq+30] 1640 paddw m1, m2, [tlq+28] 1641 add tlq, 32 1642 paddw m0, [tlq+0] 1643 pavgw m1, [tlq+4] 1644 paddw m0, [tlq+2] 1645 add r3, 32 1646 paddw m0, m1 1647 psrlw m0, 2 1648 mova [r3], m0 1649 sub r5d, 16 1650 jg .w32_filter_loop 1651 movu m0, [tlq+32] ; 2 3 4 5 6 7 8 9 a b c d e f g h 1652 punpckhwd m1, m0, m0 1653 paddw m2, [tlq+28] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 1654 paddw m0, [tlq+30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 1655 jl .w32_filter_h8 1656 vpblendd m3, m1, [tlq+36], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h h 1657 vpblendd m1, [tlq+34], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h 1658 movzx r5d, word [tlq+62] 1659 movzx r2d, word [tlq+60] 1660 pavgw m2, m3 1661 sub r2d, r5d 1662 paddw m0, m1 1663 lea r2d, [r2+r5*8+4] 1664 paddw m0, m2 1665 shr r2d, 3 1666 psrlw m0, 2 1667 mova [r3+32], m0 1668 mov [r3+66], r5w 1669 mov [r3+64], r2w 1670 mov tlq, rsp 1671 mov r3d, 65 1672 cmp hd, 64 1673 cmove maxbased, r3d 1674 jmp .w32_main 1675.w32_filter_h8: 1676 vpblendd xm3, xm1, [tlq+36], 0x07 ; 4 5 6 7 8 9 9 9 1677 vpblendd xm1, [tlq+34], 0x07 ; 3 4 5 6 7 8 9 9 1678 pavgw xm2, xm3 1679 paddw xm0, xm1 1680 mov tlq, rsp 1681 paddw xm0, xm2 1682 psrlw xm0, 2 1683 mova [r3+32], xm0 1684.w32_main: 1685 movd xm4, dxd 1686 vpbroadcastw m6, [tlq+maxbaseq*2] 1687 shl maxbased, 6 1688 vpbroadcastw m4, xm4 1689 movd xm0, maxbased 1690 mov r5d, dxd 1691 vpbroadcastd m7, [pw_m1024] ; -16 * 64 1692 vpbroadcastw m0, xm0 1693 paddw m3, m4, [z_base_inc] 1694 psubw m3, m0 1695.w32_loop: 1696 mov r3d, r5d 1697 shr r3d, 6 1698 movu m0, [tlq+r3*2] 1699 movu m1, [tlq+r3*2+2] 1700 pand m2, m5, m3 1701 psllw m2, 9 1702 psubw m1, m0 1703 pmulhrsw m1, m2 1704 paddw m0, m1 1705 psraw m1, m3, 15 1706 vpblendvb m0, m6, m0, m1 1707 mova [dstq+32*0], m0 1708 movu m0, [tlq+r3*2+32] 1709 movu m1, 
[tlq+r3*2+34] 1710 add r5d, dxd 1711 psubw m1, m0 1712 pmulhrsw m1, m2 1713 pcmpgtw m2, m7, m3 1714 paddw m3, m4 1715 paddw m0, m1 1716 vpblendvb m0, m6, m0, m2 1717 mova [dstq+32*1], m0 1718 dec hd 1719 jz .w32_end 1720 add dstq, strideq 1721 cmp r5d, maxbased 1722 jb .w32_loop 1723.w32_end_loop: 1724 mova [dstq+32*0], m6 1725 mova [dstq+32*1], m6 1726 add dstq, strideq 1727 dec hd 1728 jg .w32_end_loop 1729.w32_end: 1730 RET 1731.w64: 1732 ALLOC_STACK -256, 10 1733 lea maxbased, [hq+63] 1734 test angled, 0x400 1735 jnz .w64_main 1736 vpbroadcastd m2, [pw_3] 1737 mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 1738 punpcklwd xm1, xm0, xm0 1739 vpblendd m1, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e 1740 paddw m0, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 1741 paddw m1, m2 1742 paddw m0, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g h 1743 pavgw m1, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g h i 1744 mov r3, rsp 1745 paddw m0, m1 1746 lea r5d, [hq+32] 1747 psrlw m0, 2 1748 mova [r3], m0 1749.w64_filter_loop: 1750 mova m0, [tlq+30] 1751 paddw m1, m2, [tlq+28] 1752 add tlq, 32 1753 paddw m0, [tlq+0] 1754 pavgw m1, [tlq+4] 1755 paddw m0, [tlq+2] 1756 add r3, 32 1757 paddw m0, m1 1758 psrlw m0, 2 1759 mova [r3], m0 1760 sub r5d, 16 1761 jg .w64_filter_loop 1762 movu m0, [tlq+32] ; 2 3 4 5 6 7 8 9 a b c d e f g h 1763 punpckhwd m1, m0, m0 1764 paddw m2, [tlq+28] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 1765 paddw m0, [tlq+30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 1766 vpblendd m3, m1, [tlq+36], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h h 1767 vpblendd m1, [tlq+34], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h 1768 pavgw m2, m3 1769 paddw m0, m1 1770 paddw m0, m2 1771 mov tlq, rsp 1772 psrlw m0, 2 1773 mova [r3+32], m0 1774.w64_main: 1775 movd xm4, dxd 1776 vpbroadcastw m6, [tlq+maxbaseq*2] 1777 shl maxbased, 6 1778 vpbroadcastw m4, xm4 1779 movd xm0, maxbased 1780 mov r5d, dxd 1781 vpbroadcastd m7, [pw_m1024] ; -16 * 64 1782 vpbroadcastw m0, xm0 1783 paddw m3, m4, [z_base_inc] 1784 paddw m8, 
m7, m7 ; -32 * 64 1785 psubw m3, m0 1786 paddw m9, m8, m7 ; -48 * 64 1787.w64_loop: 1788 mov r3d, r5d 1789 shr r3d, 6 1790 movu m0, [tlq+r3*2] 1791 movu m1, [tlq+r3*2+2] 1792 pand m2, m5, m3 1793 psllw m2, 9 1794 psubw m1, m0 1795 pmulhrsw m1, m2 1796 paddw m0, m1 1797 psraw m1, m3, 15 1798 vpblendvb m0, m6, m0, m1 1799 mova [dstq+32*0], m0 1800 movu m0, [tlq+r3*2+32] 1801 movu m1, [tlq+r3*2+34] 1802 psubw m1, m0 1803 pmulhrsw m1, m2 1804 paddw m0, m1 1805 pcmpgtw m1, m7, m3 1806 vpblendvb m0, m6, m0, m1 1807 mova [dstq+32*1], m0 1808 movu m0, [tlq+r3*2+64] 1809 movu m1, [tlq+r3*2+66] 1810 psubw m1, m0 1811 pmulhrsw m1, m2 1812 paddw m0, m1 1813 pcmpgtw m1, m8, m3 1814 vpblendvb m0, m6, m0, m1 1815 mova [dstq+32*2], m0 1816 movu m0, [tlq+r3*2+96] 1817 movu m1, [tlq+r3*2+98] 1818 add r5d, dxd 1819 psubw m1, m0 1820 pmulhrsw m1, m2 1821 pcmpgtw m2, m9, m3 1822 paddw m3, m4 1823 paddw m0, m1 1824 vpblendvb m0, m6, m0, m2 1825 mova [dstq+32*3], m0 1826 dec hd 1827 jz .w64_end 1828 add dstq, strideq 1829 cmp r5d, maxbased 1830 jb .w64_loop 1831.w64_end_loop: 1832 mova [dstq+32*0], m6 1833 mova [dstq+32*1], m6 1834 mova [dstq+32*2], m6 1835 mova [dstq+32*3], m6 1836 add dstq, strideq 1837 dec hd 1838 jg .w64_end_loop 1839.w64_end: 1840 RET 1841 1842cglobal ipred_z2_16bpc, 3, 12, 12, 352, dst, stride, tl, w, h, angle, dx, dy 1843%define base r9-z_filter_t0 1844 lea r9, [ipred_z2_16bpc_avx2_table] 1845 tzcnt wd, wm 1846 movifnidn angled, anglem 1847 movifnidn hd, hm 1848 lea dxq, [dr_intra_derivative-90] 1849 movsxd wq, [r9+wq*4] 1850 mova m1, [tlq- 0] 1851 movzx dyd, angleb 1852 xor angled, 0x400 1853 mova m2, [tlq- 32] 1854 mov r8, dxq 1855 sub dxq, dyq 1856 mova m3, [tlq- 64] 1857 add wq, r9 1858 add r9, z_filter_t0-ipred_z2_16bpc_avx2_table 1859 mova m4, [tlq- 96] 1860 and dyd, ~1 1861 mova m5, [tlq-128] 1862 and dxq, ~1 1863 movzx dyd, word [r8+dyq] ; angle - 90 1864 movzx dxd, word [dxq+270] ; 180 - angle 1865 vpbroadcastd m11, [base+pw_62] 1866 mova [rsp+128], m1 
1867 mova [rsp+ 96], m2 1868 mova [rsp+ 64], m3 1869 neg dxd 1870 mova [rsp+ 32], m4 1871 neg dyq 1872 mova [rsp+ 0], m5 1873 jmp wq 1874.w4: 1875 vbroadcasti128 m10, [base+z2_x_shuf] 1876 vpbroadcastq m6, [base+z_base_inc+2] 1877 lea r8d, [dxq+(65<<6)] ; xpos 1878 mov r10d, (63-4)<<6 1879 test angled, 0x400 1880 jnz .w4_main ; !enable_intra_edge_filter 1881 lea r3d, [hq+2] 1882 add angled, 1022 1883 shl r3d, 6 1884 test r3d, angled 1885 jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8) 1886 movq xm0, [tlq+2] ; 1 2 3 4 1887 movq xm1, [tlq+0] ; 0 1 2 3 1888 pshuflw xm2, xm0, q3321 ; 2 3 4 4 1889 pshuflw xm3, xm1, q2100 ; 0 0 1 2 1890 vpbroadcastw xm4, r8m ; pixel_max 1891 vbroadcasti128 m10, [base+z_upsample] 1892 paddw xm1, xm0 1893 paddw xm2, xm3 1894 lea r8d, [r8+dxq+(1<<6)] 1895 psubw xm2, xm1, xm2 1896 add dxd, dxd 1897 psraw xm2, 3 1898 pxor xm3, xm3 1899 sub r10d, 3<<6 1900 paddw xm1, xm2 1901 paddw m6, m6 1902 pmaxsw xm1, xm3 1903 sub angled, 1075 ; angle - 53 1904 pavgw xm1, xm3 1905 lea r3d, [hq+3] 1906 pminsw xm1, xm4 1907 xor angled, 0x7f ; 180 - angle 1908 punpcklwd xm1, xm0 1909 movu [rsp+130], xm1 1910 call .filter_strength 1911 jmp .w4_filter_left 1912ALIGN function_align 1913.filter_strength: 1914 movd xm8, r3d 1915 mov r3d, angled 1916 movd xm7, angled 1917 vpbroadcastb m8, xm8 1918 shr r3d, 8 ; is_sm << 1 1919 vpbroadcastb m7, xm7 1920 pcmpeqb m8, [base+z_filter_wh] 1921 mova xm9, [r9+r3*8] 1922 pand m0, m8, m7 1923 pcmpgtb m0, m9 1924 pmovmskb r3d, m0 1925 ret 1926ALIGN function_align 1927.upsample_left: ; h4/h8 1928 mova xm0, [tlq-16] ; 8 7 6 5 4 3 2 1 1929 movu xm1, [tlq-14] ; 7 6 5 4 3 2 1 0 1930 vpbroadcastw xm4, r8m ; pixel_max 1931 cmp hd, 8 1932 je .upsample_left_h8 1933 pshufhw xm2, xm0, q2100 ; _ _ _ _ 4 4 3 2 1934 pshufhw xm3, xm1, q3321 ; _ _ _ _ 2 1 0 0 1935 jmp .upsample_left_end 1936.upsample_left_h8: 1937 pblendw xm2, xm0, [tlq-18], 0xfe ; 8 8 7 6 5 4 3 2 1938 pblendw xm3, xm1, [tlq-12], 0x7f ; 6 5 4 3 2 1 0 0 
1939.upsample_left_end: 1940 paddw xm1, xm0 1941 paddw xm2, xm3 1942 psubw xm2, xm1, xm2 1943 add dyq, dyq 1944 psraw xm2, 3 1945 pxor xm3, xm3 1946 paddw xm1, xm2 1947 pmaxsw xm1, xm3 1948 pavgw xm1, xm3 1949 pminsw xm1, xm4 1950 punpcklwd xm2, xm0, xm1 1951 punpckhwd xm0, xm1 1952 mova [rsp+ 96+gprsize], xm2 1953 mova [rsp+112+gprsize], xm0 1954 ret 1955.w4_no_upsample_above: 1956 lea r3d, [hq+3] 1957 sub angled, 1112 ; angle - 90 1958 call .filter_strength 1959 test r3d, r3d 1960 jz .w4_no_filter_above 1961 popcnt r3d, r3d 1962 vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*1] 1963 vpbroadcastd xm5, [base+z_filter_k-4+r3*4+12*0] 1964 psrldq xm0, xm1, 2 ; 1 2 3 4 1965 pshuflw xm2, xm1, q2100 ; 0 0 1 2 1966 pmullw xm4, xm0 1967 pshuflw xm3, xm0, q3321 ; 2 3 4 4 1968 paddw xm1, xm3 1969 pshuflw xm3, xm0, q3332 ; 3 4 4 4 1970 pmullw xm1, xm5 1971 vpbroadcastd xm5, [base+z_filter_k-4+r3*4+12*2] 1972 paddw xm2, xm3 1973 vpbroadcastd xm3, r6m ; max_width 1974 pmullw xm2, xm5 1975 packssdw xm3, xm3 1976 paddw xm1, xm4 1977 paddw xm1, xm2 1978 psubw xm3, [base+pw_1to16] 1979 pxor xm4, xm4 1980 psrlw xm1, 3 1981 pminsw xm3, xm11 ; clip to byte range since there's no variable word blend 1982 pavgw xm1, xm4 1983 vpblendvb xm1, xm0, xm3 1984 movq [rsp+130], xm1 1985.w4_no_filter_above: 1986 lea r3d, [hq+2] 1987 add angled, 973 ; angle + 883 1988 shl r3d, 6 1989 test r3d, angled 1990 jz .w4_upsample_left ; angle <= 140 || h > 8 || (is_sm && h == 8) 1991 vpbroadcastd xm0, [base+pb_90] 1992 psubb xm0, xm7 ; 180 - angle 1993 pand xm0, xm8 ; reuse from previous filter_strength call 1994 pcmpgtb xm0, xm9 1995 pmovmskb r3d, xm0 1996.w4_filter_left: 1997 test r3d, r3d 1998 jz .w4_main 1999 popcnt r3d, r3d 2000 mova m0, [tlq-32] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 2001 vpbroadcastd m5, r7m ; max_height 2002 cmp r3d, 3 2003 je .w4_filter_left_s3 2004 vpbroadcastd m2, [base+z_filter_k-4+r3*4+12*1] 2005 vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*0] 2006 pmullw m2, m0 2007 cmp hd, 8 2008 jl 
.w4_filter_left_h4 2009 movu m4, [tlq-34] 2010 punpcklwd m1, m0, m0 2011 vpblendd m1, m4, 0xee ; 0 0 1 2 3 4 5 6 8 8 9 a b c d e 2012 je .w4_filter_left_end 2013 vpblendd m1, m4, 0x10 ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e 2014 jmp .w4_filter_left_end 2015.w4_upsample_left: 2016 call .upsample_left 2017 mov r11, -16 2018 vbroadcasti128 m9, [base+z_upsample] 2019 jmp .w4_main_upsample_left 2020.w4_filter_left_s3: ; can only be h16 2021 movu m2, [tlq-30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 2022 vpbroadcastd m4, [base+pw_3] 2023 paddw m1, m0, m2 2024 punpckhwd m2, m2 2025 vpblendd m2, [tlq-28], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g 2026 punpcklwd xm3, xm0, xm0 2027 paddw m2, m4 2028 vpblendd m4, m3, [tlq-34], 0xfe ; 0 0 1 2 3 4 5 6 8 8 9 a b c d e 2029 vpblendd m3, [tlq-36], 0xfe ; 0 0 0 1 2 3 4 5 6 8 8 9 a b c d 2030 paddw m1, m4 2031 pavgw m2, m3 2032 paddw m1, m2 2033 psrlw m1, 2 2034 jmp .w4_filter_left_end2 2035.w4_filter_left_h4: 2036 pshufhw m1, m0, q2100 ; _ _ _ _ _ _ _ _ _ _ _ _ c c d e 2037.w4_filter_left_end: 2038 paddw m1, [tlq-30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 2039 pmullw m1, m3 2040 paddw m1, m2 2041 pxor m2, m2 2042 psrlw m1, 3 2043 pavgw m1, m2 2044.w4_filter_left_end2: 2045 packssdw m5, m5 2046 psubw m5, [base+pw_16to1] 2047 pminsw m5, m11 2048 vpblendvb m1, m0, m5 2049 mova [rsp+96], m1 2050.w4_main: 2051 vbroadcasti128 m9, [base+z2_x_shuf] 2052 mov r11, -8 2053.w4_main_upsample_left: 2054 movd xm5, dyd 2055 mova m4, [base+z2_y_shuf_h4] 2056 mov r2d, r8d 2057 movd xm0, dxd 2058 vpbroadcastw m5, xm5 2059 rorx r5, dyq, 5 2060 lea r8d, [dyq*3] 2061 pmullw m5, [base+z2_ymul] 2062 rorx r9, dyq, 4 2063 sar dyd, 6 2064 vpbroadcastw m0, xm0 2065 sar r8d, 6 2066 pand m5, m11 ; frac_y 2067 neg dyd 2068 psllw m5, 9 2069 add r5d, dyd 2070 add r8d, dyd 2071 add r9d, dyd 2072 paddw m7, m0, m0 2073 lea dyq, [rsp+dyq*2+126] 2074 vpblendd m0, m7, 0xcc 2075 add dyq, r11 2076 neg r5d 2077 paddw m1, m0, m7 2078 neg r8d 2079 vpblendd m0, m1, 0xf0 ; xpos0 xpos1 xpos2 xpos3 
2080 neg r9d 2081 paddw m7, m7 2082 paddw m6, m0 2083.w4_loop: 2084 lea r3d, [r2+dxq] 2085 shr r2d, 6 ; base_x0 2086 movu xm1, [rsp+r2*2] 2087 lea r2d, [r3+dxq] 2088 shr r3d, 6 ; base_x1 2089 movu xm3, [rsp+r3*2] 2090 lea r3d, [r2+dxq] 2091 shr r2d, 6 ; base_x2 2092 vinserti128 m1, [rsp+r2*2], 1 2093 lea r2d, [r3+dxq] 2094 shr r3d, 6 ; base_x3 2095 vinserti128 m3, [rsp+r3*2], 1 2096 pshufb m1, m10 ; a0 a1 a2 a3 A0 A1 A2 A3 2097 pshufb m3, m10 ; b0 b1 b2 b3 B0 B1 B2 B3 2098 pand m2, m11, m6 2099 punpcklqdq m0, m1, m3 2100 punpckhqdq m1, m3 2101 psllw m2, 9 2102 psubw m1, m0 2103 pmulhrsw m1, m2 2104 paddw m0, m1 2105 cmp r3d, 64 2106 jge .w4_toponly 2107 movu xm2, [dyq] 2108 vinserti128 m2, [dyq+r8*2], 1 2109 movu xm3, [dyq+r5*2] 2110 vinserti128 m3, [dyq+r9*2], 1 2111 pshufb m2, m9 2112 pshufb m3, m9 2113 punpckhwd m1, m2, m3 ; a3 b3 a2 b2 a1 b1 a0 b0 2114 punpcklwd m2, m3 2115 psubw m2, m1 2116 pmulhrsw m2, m5 2117 psraw m3, m6, 15 ; base_x < topleft 2118 paddw m1, m2 2119 vpermd m1, m4, m1 ; a0 b0 c0 d0 a1 b1 c1 d1 a2 b2 c2 d2 a3 b3 c3 d3 2120 vpblendvb m0, m1, m3 2121.w4_toponly: 2122 paddw m6, m7 ; xpos += dx 2123 lea r3, [strideq*3] 2124 add dyq, r11 2125 vextracti128 xm1, m0, 1 2126 movq [dstq+strideq*0], xm0 2127 movhps [dstq+strideq*1], xm0 2128 movq [dstq+strideq*2], xm1 2129 movhps [dstq+r3 ], xm1 2130 sub hd, 4 2131 jz .w4_end 2132 lea dstq, [dstq+strideq*4] 2133 cmp r2d, r10d 2134 jge .w4_loop 2135.w4_leftonly_loop: 2136 movu xm1, [dyq] 2137 vinserti128 m1, [dyq+r8*2], 1 2138 movu xm2, [dyq+r5*2] 2139 vinserti128 m2, [dyq+r9*2], 1 2140 add dyq, r11 2141 pshufb m1, m9 2142 pshufb m2, m9 2143 punpckhwd m0, m1, m2 2144 punpcklwd m1, m2 2145 psubw m1, m0 2146 pmulhrsw m1, m5 2147 paddw m0, m1 2148 vpermd m0, m4, m0 2149 vextracti128 xm1, m0, 1 2150 movq [dstq+strideq*0], xm0 2151 movhps [dstq+strideq*1], xm0 2152 movq [dstq+strideq*2], xm1 2153 movhps [dstq+r3 ], xm1 2154 lea dstq, [dstq+strideq*4] 2155 sub hd, 4 2156 jg .w4_leftonly_loop 2157.w4_end: 2158 
RET 2159.w8: 2160 mov r10d, hd 2161 test angled, 0x400 2162 jnz .w8_main 2163 lea r3d, [angleq+126] 2164 xor r8d, r8d 2165 mov r3b, hb 2166 cmp r3d, 8 2167 ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm 2168 movu xm0, [tlq+2] ; 1 2 3 4 5 6 7 8 2169 mova xm1, [tlq+0] ; 0 1 2 3 4 5 6 7 2170 pblendw xm2, xm0, [tlq+4], 0x7f ; 2 3 4 5 6 7 8 8 2171 pblendw xm3, xm1, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6 2172 vpbroadcastw xm4, r8m ; pixel_max 2173 paddw xm1, xm0 2174 paddw xm2, xm3 2175 not r8d 2176 psubw xm2, xm1, xm2 2177 add dxd, dxd 2178 psraw xm2, 3 2179 sub angled, 53 ; angle - 53 2180 pxor xm3, xm3 2181 paddw xm2, xm1 2182 lea r3d, [hq+7] 2183 pmaxsw xm2, xm3 2184 xor angled, 0x7f ; 180 - angle 2185 pavgw xm2, xm3 2186 pminsw xm2, xm4 2187 punpcklwd xm1, xm2, xm0 2188 punpckhwd xm2, xm0 2189 movu [rsp+130], xm1 2190 movu [rsp+146], xm2 2191 call .filter_strength 2192 jmp .w8_filter_left 2193.w8_no_upsample_above: 2194 lea r3d, [hq+7] 2195 sub angled, 90 ; angle - 90 2196 call .filter_strength 2197 test r3d, r3d 2198 jz .w8_no_filter_above 2199 popcnt r3d, r3d 2200 vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*1] 2201 vpbroadcastd xm5, [base+z_filter_k-4+r3*4+12*0] 2202 vpbroadcastd xm6, [base+z_filter_k-4+r3*4+12*2] 2203 movu xm0, [tlq+2] ; 1 2 3 4 5 6 7 8 x 2204 pblendw xm2, xm1, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6 x 2205 pmullw xm4, xm0 2206 pblendw xm3, xm0, [tlq+4], 0x7f ; 2 3 4 5 6 7 8 8 x 2207 paddw xm1, xm3 2208 vpblendd xm3, [tlq+6], 0x07 ; 3 4 5 6 7 8 8 8 x 2209 paddw xm2, xm3 2210 vpbroadcastd xm3, r6m ; max_width 2211 pmullw xm1, xm5 2212 pmullw xm2, xm6 2213 packssdw xm3, xm3 2214 paddw xm1, xm4 2215 paddw xm1, xm2 2216 psubw xm3, [base+pw_1to16] 2217 pxor xm4, xm4 2218 psrlw xm1, 3 2219 pminsw xm3, xm11 2220 pavgw xm1, xm4 2221 vpblendvb xm1, xm0, xm3 2222 movu [rsp+130], xm1 2223.w8_no_filter_above: 2224 lea r3d, [angleq-51] 2225 mov r3b, hb 2226 cmp r3d, 8 2227 jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm 2228 vpbroadcastd m0, [base+pb_90] 
2229 psubb m0, m7 2230 pand m0, m8 2231 pcmpgtb m0, m9 2232 pmovmskb r3d, m0 2233.w8_filter_left: 2234 test r3d, r3d 2235 jz .w8_main 2236 popcnt r3d, r3d 2237 cmp r3d, 3 2238 jne .w8_filter_left_s12 2239 vpbroadcastd m6, [base+pw_3] 2240 vpbroadcastd m7, [base+pw_16] 2241 cmp hd, 16 ; flags needed for later 2242 jmp .filter_left_s3b 2243.w8_upsample_left: 2244 call .upsample_left 2245 vbroadcasti128 m7, [base+z2_y_shuf_us] 2246 lea r11, [rsp+118] 2247 mov r8, -8 2248 jmp .w8_main_upsample_left 2249.w16_filter_left_s12: 2250 xor r8d, r8d 2251.w8_filter_left_s12: 2252 mova m0, [tlq-32] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 2253 vpbroadcastd m5, r7m ; max_height 2254 vpbroadcastd m2, [base+z_filter_k-4+r3*4+12*1] 2255 vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*0] 2256 pmullw m2, m0 2257 cmp hd, 8 2258 jl .w8_filter_left_h4 2259 movu m4, [tlq-34] 2260 punpcklwd m1, m0, m0 2261 vpblendd m1, m4, 0xee ; 0 0 1 2 3 4 5 6 8 8 9 a b c d e 2262 je .w8_filter_left_end 2263 vpblendd m1, m4, 0x10 ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e 2264 jmp .w8_filter_left_end 2265.w8_filter_left_h4: 2266 pshufhw m1, m0, q2100 ; _ _ _ _ _ _ _ _ _ _ _ _ c c d e 2267.w8_filter_left_end: 2268 paddw m1, [tlq-30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 2269 pmullw m1, m3 2270 paddw m1, m2 2271 pxor m2, m2 2272 psrlw m1, 3 2273 pavgw m1, m2 2274 packssdw m5, m5 2275 psubw m5, [base+pw_16to1] 2276 pminsw m5, m11 2277 vpblendvb m1, m0, m5 2278 mova [rsp+96], m1 2279 test r8d, r8d 2280 jz .w8_main 2281; upsample_main 2282 vbroadcasti128 m10, [base+z_upsample] 2283 vbroadcasti128 m7, [base+z2_y_shuf] 2284 lea r5, [rsp+120] 2285 movd xm1, dyd 2286 vbroadcasti128 m4, [base+z_base_inc+2] 2287 movd xm2, dxd 2288 vpbroadcastw m1, xm1 2289 vpbroadcastw m2, xm2 2290 mov r7, dstq 2291 paddw m4, m4 2292 pmullw m0, m1, [base+z2_ymul8] 2293 paddw m5, m2, m2 2294 psllw xm1, 3 2295 vpblendd m2, m5, 0xf0 2296 lea r2d, [dxq+(66<<6)] ; xpos 2297 paddw m4, m2 2298 pshufd m6, m0, q2020 2299 psraw xm0, 6 2300 pxor xm1, xm1 2301 psubw 
xm8, xm1, xm0 2302 pand m6, m11 2303 punpckhwd xm9, xm8, xm1 2304 psllw m6, 9 2305 punpcklwd xm8, xm1 2306.w8_upsample_above_loop: 2307 lea r3d, [r2+dxq] 2308 shr r2d, 6 2309 movu xm1, [rsp+r2*2] 2310 movu xm2, [rsp+r2*2+16] 2311 lea r2d, [r3+dxq] 2312 shr r3d, 6 2313 vinserti128 m1, [rsp+r3*2], 1 2314 vinserti128 m2, [rsp+r3*2+16], 1 2315 pshufb m1, m10 2316 pshufb m2, m10 2317 punpcklqdq m0, m1, m2 ; a0 b0 c0 d0 e0 f0 g0 h0 2318 punpckhqdq m1, m2 2319 pand m2, m11, m4 2320 psubw m1, m0 2321 psllw m2, 9 2322 pmulhrsw m1, m2 2323 paddw m0, m1 2324 cmp r3d, 64 2325 jge .w8_upsample_above_toponly 2326 mova m1, m5 2327 vpgatherdq m3, [r5+xm9*2], m5 2328 mova m5, m1 2329 vpgatherdq m2, [r5+xm8*2], m1 2330 pshufb m3, m7 2331 pshufb m2, m7 2332 punpckldq m1, m2, m3 2333 punpckhdq m2, m3 2334 psubw m2, m1 2335 pmulhrsw m2, m6 2336 paddw m1, m2 2337 vpermq m1, m1, q3120 2338 psraw m2, m4, 15 2339 vpblendvb m0, m1, m2 2340.w8_upsample_above_toponly: 2341 paddw m4, m5 2342 sub r5, 4 2343 mova [dstq+strideq*0], xm0 2344 vextracti128 [dstq+strideq*1], m0, 1 2345 sub hd, 2 2346 jz .w8_ret 2347 lea dstq, [dstq+strideq*2] 2348 jmp .w8_upsample_above_loop 2349.w8_main: 2350 vbroadcasti128 m7, [base+z2_y_shuf] 2351 lea r11, [rsp+120] 2352 mov r8, -4 2353.w8_main_upsample_left: 2354 movd xm1, dyd 2355 vbroadcasti128 m4, [base+z_base_inc+2] 2356 movd xm2, dxd 2357 vpbroadcastw m1, xm1 2358 vpbroadcastw m2, xm2 2359 mov r7, dstq 2360 pmullw m0, m1, [base+z2_ymul8] 2361 paddw m5, m2, m2 2362 psllw xm1, 3 2363 vpblendd m2, m5, 0xf0 ; xpos0 xpos1 2364 lea r9d, [dxq+(65<<6)] ; xpos 2365 paddw m4, m2 2366 movd [rsp+284], xm1 2367.w8_loop0: 2368 mov r2d, r9d 2369 mova [rsp+288], m0 2370 mov r5, r11 2371 mova [rsp+320], m4 2372 pshufd m6, m0, q2020 2373 psraw xm0, 6 2374 pxor xm1, xm1 2375 psubw xm8, xm1, xm0 ; base_y 2376 pand m6, m11 ; frac_y 2377 punpckhwd xm9, xm8, xm1 ; base_y 2 3 6 7 2378 psllw m6, 9 2379 punpcklwd xm8, xm1 ; base_y 0 1 4 5 2380.w8_loop: 2381 lea r3d, [r2+dxq] 2382 shr 
r2d, 6 ; base_x0 2383 movu xm0, [rsp+r2*2] 2384 movu xm1, [rsp+r2*2+2] 2385 lea r2d, [r3+dxq] 2386 shr r3d, 6 ; base_x1 2387 vinserti128 m0, [rsp+r3*2], 1 2388 vinserti128 m1, [rsp+r3*2+2], 1 2389 pand m2, m11, m4 2390 psubw m1, m0 2391 psllw m2, 9 2392 pmulhrsw m1, m2 2393 paddw m0, m1 2394 cmp r3d, 64 2395 jge .w8_toponly 2396 mova m1, m5 2397 vpgatherdq m3, [r5+xm9*2], m5 2398 mova m5, m1 2399 vpgatherdq m2, [r5+xm8*2], m1 2400 pshufb m3, m7 ; c0 d0 c1 d1 g0 h0 g1 h1 2401 pshufb m2, m7 ; a0 b0 a1 b1 e0 f0 e1 f1 2402 punpckldq m1, m2, m3 ; a0 b0 c0 d0 a1 b1 c1 d1 e0 f0 g0 h0 e1 f1 g1 h1 2403 punpckhdq m2, m3 2404 psubw m2, m1 2405 pmulhrsw m2, m6 2406 paddw m1, m2 2407 vpermq m1, m1, q3120 2408 psraw m2, m4, 15 ; base_x < topleft 2409 vpblendvb m0, m1, m2 2410.w8_toponly: 2411 paddw m4, m5 ; xpos += dx 2412 add r5, r8 2413 mova [dstq+strideq*0], xm0 2414 vextracti128 [dstq+strideq*1], m0, 1 2415 sub hd, 2 2416 jz .w8_end 2417 lea dstq, [dstq+strideq*2] 2418 cmp r2d, (63-8)<<6 2419 jge .w8_loop 2420.w8_leftonly_loop: 2421 mova m0, m5 2422 vpgatherdq m4, [r5+xm9*2], m5 2423 mova m5, m0 2424 vpgatherdq m3, [r5+xm8*2], m0 2425 add r5, r8 2426 pshufb m2, m4, m7 2427 pshufb m1, m3, m7 2428 punpckldq m0, m1, m2 2429 punpckhdq m1, m2 2430 psubw m1, m0 2431 pmulhrsw m1, m6 2432 paddw m0, m1 2433 vpermq m0, m0, q3120 2434 mova [dstq+strideq*0], xm0 2435 vextracti128 [dstq+strideq*1], m0, 1 2436 lea dstq, [dstq+strideq*2] 2437 sub hd, 2 2438 jg .w8_leftonly_loop 2439.w8_end: 2440 sub r10d, 1<<8 2441 jl .w8_ret 2442 vpbroadcastd m0, [rsp+284] 2443 add r7, 16 2444 paddw m0, [rsp+288] ; base_y += 8*dy 2445 add r9d, 8<<6 2446 vpbroadcastd m4, [pw_512] 2447 movzx hd, r10b 2448 paddw m4, [rsp+320] ; base_x += 8*64 2449 mov dstq, r7 2450 jmp .w8_loop0 2451.w8_ret: 2452 RET 2453.w16: 2454 movd xm0, [tlq+32] 2455 lea r10d, [hq+(1<<8)] 2456 movd [rsp+160], xm0 2457 test angled, 0x400 2458 jnz .w8_main 2459 lea r3d, [hq+15] 2460 sub angled, 90 2461 call .filter_strength 2462 test r3d, 
r3d 2463 jz .w16_no_filter_above 2464 popcnt r3d, r3d 2465 vpbroadcastd m4, [base+z_filter_k-4+r3*4+12*1] 2466 vpbroadcastd m5, [base+z_filter_k-4+r3*4+12*0] 2467 vpbroadcastd m6, [base+z_filter_k-4+r3*4+12*2] 2468 movu m0, [tlq+2] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 2469 punpcklwd xm2, xm1, xm1 2470 vpblendd m2, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e 2471 punpckhwd m3, m0, m0 2472 pmullw m4, m0 2473 vpblendd m3, [tlq+4], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g 2474 paddw m1, m3 2475 vpblendd m3, [tlq+6], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g g g 2476 paddw m2, m3 2477 vpbroadcastd m3, r6m ; max_width 2478 pmullw m1, m5 2479 pmullw m2, m6 2480 packssdw m3, m3 2481 paddw m1, m4 2482 paddw m1, m2 2483 psubw m3, [base+pw_1to16] 2484 pxor m4, m4 2485 psrlw m1, 3 2486 pminsw m3, m11 2487 pavgw m1, m4 2488 vpblendvb m1, m0, m3 2489 movu [rsp+130], m1 2490.w16_no_filter_above: 2491 vpbroadcastd m0, [base+pb_90] 2492 psubb m0, m7 2493 pand m0, m8 2494 pcmpgtb m0, m9 2495 pmovmskb r3d, m0 2496 test r3d, r3d 2497 jz .w8_main 2498 popcnt r3d, r3d 2499 cmp r3d, 3 2500 jne .w16_filter_left_s12 2501 vpbroadcastd m6, [base+pw_3] 2502 vpbroadcastd m7, [base+pw_16] 2503 cmp hd, 4 2504 jne .filter_left_s3 2505 movq xm0, [tlq-8] ; 0 1 2 3 2506 movq xm1, [tlq-6] ; 1 2 3 4 2507 vpbroadcastd xm5, r7m ; max_height 2508 movq xm4, [base+pw_16to1+24] ; 4to1 2509 pshuflw xm2, xm0, q2100 ; 0 0 1 2 2510 pshuflw xm3, xm1, q3321 ; 2 3 4 4 2511 paddw xm1, xm0 2512 paddw xm1, xm2 2513 pshuflw xm2, xm0, q1000 ; 0 0 0 1 2514 paddw xm3, xm6 2515 packssdw xm5, xm5 2516 pavgw xm2, xm3 2517 psubw xm5, xm4 2518 paddw xm1, xm2 2519 pminsw xm5, xm11 2520 psrlw xm1, 2 2521 vpblendvb xm1, xm0, xm5 2522 movq [rsp+120], xm1 2523 jmp .w8_main 2524.w32: 2525 mova m2, [tlq+32] 2526 movd xm0, [tlq+64] 2527 lea r10d, [hq+(3<<8)] 2528 mova [rsp+160], m2 2529 movd [rsp+192], xm0 2530 test angled, 0x400 2531 jnz .w8_main 2532 vpbroadcastd m6, [base+pw_3] 2533 vpbroadcastd m0, r6m ; max_width 2534 vpbroadcastd m7, 
[base+pw_16] 2535 mov r3d, 32 2536 packssdw m0, m0 2537 psubw m0, [base+pw_1to16] 2538 pminsw m8, m0, m11 2539 psubw m9, m8, m7 2540.w32_filter_above: 2541 movu m0, [tlq+2] 2542 punpcklwd xm4, xm1, xm1 2543 paddw m2, m6, [tlq+6] 2544 paddw m1, m0 2545 vpblendd m4, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e 2546 paddw m1, [tlq+4] 2547 movu m3, [tlq+r3+2] 2548 paddw m5, m6, [tlq+r3-2] 2549 pavgw m2, m4 2550 punpckhwd m4, m3, m3 2551 paddw m1, m2 2552 vpblendd m2, m4, [tlq+r3+6], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h h 2553 vpblendd m4, [tlq+r3+4], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h 2554 pavgw m2, m5 2555 paddw m5, m3, [tlq+r3] 2556 paddw m4, m5 2557 psrlw m1, 2 2558 paddw m2, m4 2559 vpblendvb m1, m0, m8 2560 psrlw m2, 2 2561 vpblendvb m2, m3, m9 2562 movu [rsp+130], m1 2563 movu [rsp+r3+130], m2 2564.filter_left_s3: 2565 cmp hd, 16 2566 jl .filter_left_s3_h8 ; h8 2567.filter_left_s3b: 2568 mova m0, [tlq-32] ; 2 3 4 5 6 7 8 9 a b c d e f g h 2569 movu m2, [tlq-30] ; 3 4 5 6 7 8 9 a b c d e f g h i 2570 vpbroadcastd m5, r7m ; max_height 2571 paddw m1, m0, m2 2572 punpckhwd m2, m2 2573 mov r3d, hd 2574 vpblendd m2, [tlq-28], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i 2575 packssdw m5, m5 2576 not r3 2577 psubw m5, [base+pw_16to1] 2578 paddw m2, m6 2579 pminsw m8, m11, m5 2580 je .filter_left_s3_end ; h16 2581 paddw m1, [tlq-34] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 2582 pavgw m2, [tlq-36] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 2583 paddw m1, m2 2584 psrlw m1, 2 2585 vpblendvb m3, m1, m0, m8 2586 mova m0, [tlq-64] ; 2 3 4 5 6 7 8 9 a b c d e f g h 2587 paddw m1, m0, [tlq-62] ; 3 4 5 6 7 8 9 a b c d e f g h i 2588 paddw m2, m6, [tlq-60] ; 4 5 6 7 8 9 a b c d e f g h i j 2589 psubw m8, m7 2590 mova [rsp+96], m3 2591 jnp .filter_left_s3_end ; h32 2592 mova m5, [tlq-96] 2593 paddw m1, [tlq-66] 2594 pavgw m2, [tlq-68] 2595 paddw m1, m2 2596 paddw m4, m5, [tlq-94] 2597 paddw m2, m6, [tlq-92] 2598 psrlw m1, 2 2599 paddw m4, [tlq- 98] 2600 pavgw m2, [tlq-100] 2601 vpblendvb m3, m1, 
m0, m8 2602 mova m0, [tlq-128] 2603 psubw m8, m7 2604 paddw m4, m2 2605 paddw m1, m0, [tlq-126] 2606 paddw m2, m6, [tlq-124] 2607 psrlw m4, 2 2608 mova [rsp+64], m3 2609 vpblendvb m4, m5, m8 2610 psubw m8, m7 2611 mova [rsp+32], m4 2612.filter_left_s3_end: 2613 punpcklwd xm3, xm0, xm0 2614 vpblendd m4, m3, [tlq+r3*2], 0xfe ; 2 2 3 4 5 6 7 8 9 a b c d e f g 2615 vpblendd m3, [tlq+r3*2-2], 0xfe ; 2 2 2 3 4 5 6 7 8 9 a b c d e f 2616 paddw m1, m4 2617 pavgw m2, m3 2618 paddw m1, m2 2619 psrlw m1, 2 2620 vpblendvb m1, m0, m8 2621 mova [rsp+r3*2+130], m1 2622 jmp .w8_main 2623.filter_left_s3_h8: 2624 mova xm0, [tlq-16] ; 0 1 2 3 4 5 6 7 2625 movu xm3, [tlq-14] ; 1 2 3 4 5 6 7 8 2626 pblendw xm2, xm0, [tlq-18], 0xfe ; 0 0 1 2 3 4 5 6 2627 vpbroadcastd xm5, r7m ; max_height 2628 paddw xm1, xm0, xm3 2629 pblendw xm3, [tlq-12], 0x7f ; 2 3 4 5 6 7 8 8 2630 paddw xm1, xm2 2631 vpblendd xm2, [tlq-20], 0x0e ; 0 0 0 1 2 3 4 5 2632 paddw xm3, xm6 2633 packssdw xm5, xm5 2634 pavgw xm2, xm3 2635 psubw xm5, [base+pw_16to1+16] ; 8to1 2636 paddw xm1, xm2 2637 pminsw xm5, xm11 2638 psrlw xm1, 2 2639 vpblendvb xm1, xm0, xm5 2640 mova [rsp+112], xm1 2641 jmp .w8_main 2642.w64: 2643 mova m2, [tlq+ 32] 2644 mova m3, [tlq+ 64] 2645 mova m4, [tlq+ 96] 2646 movd xm0, [tlq+128] 2647 lea r10d, [hq+(7<<8)] 2648 mova [rsp+160], m2 2649 mova [rsp+192], m3 2650 mova [rsp+224], m4 2651 movd [rsp+256], xm0 2652 test angled, 0x400 2653 jnz .w8_main 2654 vpbroadcastd m6, [base+pw_3] 2655 movu m0, [tlq+34] ; 2 3 4 5 6 7 8 9 a b c d e f g h 2656 paddw m2, m6, [tlq+30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 2657 paddw m5, m0, [tlq+32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 2658 pavgw m2, [tlq+38] ; 4 5 6 7 8 9 a b c d e f g h h h 2659 paddw m5, [tlq+36] ; 3 4 5 6 7 8 9 a b c d e f g h h 2660 movu m4, [tlq+66] 2661 paddw m3, m6, [tlq+62] 2662 paddw m7, m4, [tlq+64] 2663 pavgw m3, [tlq+70] 2664 paddw m7, [tlq+68] 2665 paddw m2, m5 2666 vpbroadcastd m5, r6m ; max_width 2667 mov r3d, 96 2668 packssdw m5, m5 2669 paddw 
; NOTE(review): the chunk boundary splits an instruction here — the two
; operands below belong to a "paddw" whose mnemonic sits at the end of the
; previous chunk line (tail of the ipred_z2 .w64 above-edge filtering).
m3, m7
    psubw               m5, [base+pw_1to16]
    psrlw               m2, 2
    vpbroadcastd        m7, [base+pw_16]
    psrlw               m3, 2
    pminsw              m8, m11, m5         ; per-lane max_width clamp masks
    psubw               m9, m8, m7
    vpblendvb           m2, m0, m9          ; keep unfiltered pixels beyond max_width
    psubw               m9, m7
    vpblendvb           m3, m4, m9
    psubw               m9, m7
    movu                [rsp+162], m2       ; stash filtered top edge on the stack
    movu                [rsp+194], m3
    jmp .w32_filter_above

;-----------------------------------------------------------------------------
; ipred_z3_16bpc: directional ("Z3", angle > 180) intra prediction, 16 bpc,
; AVX2. Predicts from the left edge only; tl points at the top-left pixel.
; Registers (x86inc names): dst/stride = output, tl = edge pixels, w/h = block
; size, angle = prediction angle (bit 0x400 = intra-edge-filter disable after
; the xor below), dy = per-row position step looked up from
; dr_intra_derivative, org_w saves w for the transpose stores, maxbase = index
; of the last valid left-edge pixel.
;-----------------------------------------------------------------------------
cglobal ipred_z3_16bpc, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase
    lea                 r6, [ipred_z3_16bpc_avx2_table]
    tzcnt               hd, hm              ; hd = log2(h), index into jump table
    movifnidn           angled, anglem
    lea                 r7, [dr_intra_derivative+45*2-1]
    sub                 tlq, 2              ; step back one 16-bit pixel
    movsxd              hq, [r6+hq*4]       ; per-height branch target
    sub                 angled, 180
    add                 hq, r6
    mov                 dyd, angled
    neg                 dyd
    xor                 angled, 0x400       ; flip the edge-filter-disable bit
    or                  dyq, ~0x7e          ; clamp table index to the 45..90 range
    movzx               dyd, word [r7+dyq]  ; dy = dr_intra_derivative[...]
    vpbroadcastd        m5, [pw_62]         ; mask for the 6-bit fractional part
    mov                 org_wd, wd          ; w is clobbered below; keep a copy
    jmp                 hq
.h4:
    ALLOC_STACK      -64, 7
    lea                 r7, [strideq*3]
    cmp                 angleb, 40
    jae .h4_no_upsample
    lea                 r4d, [angleq-1024]
    sar                 r4d, 7
    add                 r4d, wd
    jg .h4_no_upsample ; !enable_intra_edge_filter || w > 8 || (w == 8 && is_sm)
    ; Upsample the left edge:
    ; out = clamp((9*(l[i]+l[i+1]) - (l[i-1]+l[i+2]) + 8) >> 4, 0, pixel_max),
    ; realized below as x = inner + ((inner - outer) >> 3), then pavgw-with-0
    ; (halving with rounding) and clamping.
    mova                xm2, [tlq-14]               ; 0 1 2 3 4 5 6 7
    pblendw             xm1, xm2, [tlq-16], 0xfe    ; 0 0 1 2 3 4 5 6
    vpblendd            xm0, xm1, [tlq-18], 0x0e    ; 0 0 0 1 2 3 4 5
    pshufd              xm3, xm1, q0000
    paddw               xm1, xm2                    ; inner pair sum
    paddw               xm0, [tlq-12]               ; 1 2 3 4 5 6 7 8 -> outer pair sum
    vpbroadcastw        xm4, r8m ; pixel_max
    add                 dyd, dyd                    ; dy *= 2 (edge now has 2x samples)
    psubw               xm0, xm1, xm0
    mova         [rsp+ 0], xm3
    movd                xm3, dyd
    psraw               xm0, 3
    neg                 dyd
    paddw               xm1, xm0
    pxor                xm0, xm0
    lea                 r2d, [dyq+(16<<6)+63] ; ypos
    pmaxsw              xm1, xm0
    pavgw               xm1, xm0
    vpbroadcastw        m3, xm3
    pminsw              xm1, xm4
    ; interleave upsampled and original samples and store the doubled edge
    punpckhwd           xm0, xm1, xm2
    punpcklwd           xm1, xm2
    paddw               m2, m3, m3
    mova          [rsp+32], xm0
    punpcklwd           m3, m2
    mova          [rsp+16], xm1
    paddw               m4, m2, m2          ; m4 = 4*dy per-iteration step
    paddw               m2, m3
    vpblendd            m3, m2, 0xf0 ; ypos0 ypos1 ypos2 ypos3
.h4_upsample_loop:
    lea                 r4d, [r2+dyq]
    shr                 r2d, 6              ; integer part of ypos -> base index
    movu                xm1, [rsp+r2*2]
    lea                 r2d, [r4+dyq]
    shr                 r4d, 6
    movu                xm2, [rsp+r4*2]
; NOTE(review): chunk boundary again splits an instruction — this "lea"
; takes its operands from the start of the next chunk line.
    lea
r4d, [r2+dyq] 2747 shr r2d, 6 2748 vinserti128 m1, [rsp+r2*2], 1 2749 lea r2d, [r4+dyq] 2750 shr r4d, 6 2751 vinserti128 m2, [rsp+r4*2], 1 2752 psrld m0, m1, 16 2753 pblendw m0, m2, 0xaa ; a3 b3 a2 b2 a1 b1 a0 b0 c3 d3 c2 d2 c1 d1 c0 d0 2754 pslld m2, 16 2755 pblendw m1, m2, 0xaa 2756 pand m2, m5, m3 2757 psllw m2, 9 2758 psubw m1, m0 2759 pmulhrsw m1, m2 2760 paddw m3, m4 2761 paddw m1, m0 2762 vextracti128 xm2, m1, 1 2763 punpckhdq xm0, xm1, xm2 ; a1 b1 c1 d1 a0 b0 c0 d0 2764 punpckldq xm1, xm2 ; a3 b3 c3 d3 a2 b2 c2 d2 2765 movhps [dstq+strideq*0], xm0 2766 movq [dstq+strideq*1], xm0 2767 movhps [dstq+strideq*2], xm1 2768 movq [dstq+r7 ], xm1 2769 add dstq, 8 2770 sub wd, 4 2771 jg .h4_upsample_loop 2772 RET 2773ALIGN function_align 2774.filter_strength: ; h4/h8/h16 2775%define base r4-z_filter_t0 2776 lea r4, [z_filter_t0] 2777 movd xm0, maxbased 2778 movd xm1, angled 2779 shr angled, 8 ; is_sm << 1 2780 vpbroadcastb m0, xm0 2781 vpbroadcastb m1, xm1 2782 pcmpeqb m0, [base+z_filter_wh] 2783 pand m0, m1 2784 mova xm1, [r4+angleq*8] 2785 pcmpgtb m0, m1 2786 pmovmskb r5d, m0 2787 ret 2788.h4_no_upsample: 2789 mov maxbased, 7 2790 test angled, 0x400 ; !enable_intra_edge_filter 2791 jnz .h4_main 2792 lea maxbased, [wq+3] 2793 call .filter_strength 2794 mov maxbased, 7 2795 test r5d, r5d 2796 jz .h4_main ; filter_strength == 0 2797 popcnt r5d, r5d 2798 mova xm0, [tlq-14] ; 0 1 2 3 4 5 6 7 2799 movu xm3, [tlq-12] ; 1 2 3 4 5 6 7 8 2800 vpbroadcastd xm2, [base+z_filter_k-4+r5*4+12*1] 2801 vpbroadcastd xm4, [base+z_filter_k-4+r5*4+12*0] 2802 pmullw xm2, xm0 2803 pblendw xm0, [tlq-16], 0xfe ; 0 0 1 2 3 4 5 6 2804 paddw xm1, xm0, xm3 2805 movd [rsp+12], xm0 2806 pmullw xm1, xm4 2807 cmp r5d, 3 2808 jne .h4_filter_3tap 2809 pblendw xm3, [tlq-10], 0x7f ; 2 3 4 5 6 7 8 8 2810 vpblendd xm0, [tlq-18], 0x0e ; 0 0 0 1 2 3 4 5 2811 movzx r4d, word [tlq-14] 2812 movzx r2d, word [tlq-12] 2813 inc maxbased 2814 paddw xm1, xm2 2815 paddw xm0, xm3 2816 sub r2d, r4d 2817 paddw xm2, 
xm0, xm0 2818 lea r2d, [r2+r4*8+4] 2819 shr r2d, 3 2820 mov [rsp+14], r2w 2821.h4_filter_3tap: 2822 pxor xm0, xm0 2823 paddw xm1, xm2 2824 lea tlq, [rsp+30] 2825 psrlw xm1, 3 2826 cmp wd, 8 2827 sbb maxbased, -1 2828 pavgw xm0, xm1 2829 mova [rsp+16], xm0 2830.h4_main: 2831 movd xm3, dyd 2832 neg maxbaseq 2833 vbroadcasti128 m1, [z_base_inc] 2834 vpbroadcastw m6, [tlq+maxbaseq*2] 2835 shl maxbased, 6 2836 vpbroadcastw m3, xm3 2837 lea r4d, [maxbaseq+3*64] 2838 neg dyq 2839 movd xm2, r4d 2840 sub tlq, 8 2841 lea r4, [dyq+63] ; ypos 2842 punpcklwd m1, m1 2843 paddw m0, m3, m3 2844 vpbroadcastw m2, xm2 2845 punpcklwd m3, m0 2846 paddw m4, m0, m0 2847 paddw m0, m3 2848 psubw m2, m1 2849 vpblendd m3, m0, 0xf0 ; ypos0 ypos1 ypos2 ypos3 2850 or maxbased, 63 2851 paddw m3, m2 2852.h4_loop: 2853 lea r5, [r4+dyq] 2854 sar r4, 6 ; base0 2855 movu xm1, [tlq+r4*2] 2856 lea r4, [r5+dyq] 2857 sar r5, 6 ; base1 2858 movu xm2, [tlq+r5*2] 2859 lea r5, [r4+dyq] 2860 sar r4, 6 ; base2 2861 vinserti128 m1, [tlq+r4*2], 1 2862 lea r4, [r5+dyq] 2863 sar r5, 6 ; base3 2864 vinserti128 m2, [tlq+r5*2], 1 2865 punpckhwd m0, m1, m2 2866 punpcklwd m1, m2 2867 pand m2, m5, m3 2868 palignr m0, m1, 4 ; a3 b3 a2 b2 a1 b1 a0 b0 c3 d3 c2 d2 c1 d1 c0 d0 2869 psllw m2, 9 2870 psubw m1, m0 2871 pmulhrsw m1, m2 2872 psraw m2, m3, 15 ; ypos < max_base_y 2873 paddw m3, m4 2874 paddw m1, m0 2875 vpblendvb m1, m6, m1, m2 2876 vextracti128 xm2, m1, 1 2877 punpckhdq xm0, xm1, xm2 ; a1 b1 c1 d1 a0 b0 c0 d0 2878 punpckldq xm1, xm2 ; a3 b3 c3 d3 a2 b2 c2 d2 2879 movhps [dstq+strideq*0], xm0 2880 movq [dstq+strideq*1], xm0 2881 movhps [dstq+strideq*2], xm1 2882 movq [dstq+r7 ], xm1 2883 sub wd, 4 2884 jz .h4_end 2885 add dstq, 8 2886 cmp r4d, maxbased 2887 jg .h4_loop 2888.h4_end_loop: 2889 movq [dstq+strideq*0], xm6 2890 movq [dstq+strideq*1], xm6 2891 movq [dstq+strideq*2], xm6 2892 movq [dstq+r7 ], xm6 2893 add dstq, 8 2894 sub wd, 4 2895 jg .h4_end_loop 2896.h4_end: 2897 RET 2898.h8: 2899 lea r4d, [angleq+216] 
2900 ALLOC_STACK -64, 8 2901 mov r4b, wb 2902 lea r7, [strideq*3] 2903 cmp r4d, 8 2904 ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8 2905 mova m2, [tlq-30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 2906 paddw m1, m2, [tlq-32] ; _ 0 1 2 3 4 5 6 7 8 9 a b c d e 2907 movu m0, [tlq-34] ; _ _ 0 1 2 3 4 5 6 7 8 9 a b c d 2908 cmp wd, 8 2909 je .h8_upsample_w8 2910 pshufhw xm3, xm2, q1000 2911 vpblendd m0, m3, 0x0f ; _ _ _ _ 4 4 4 5 6 7 8 9 a b c d 2912.h8_upsample_w8: 2913 paddw m0, [tlq-28] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 2914 vpbroadcastw m4, r8m ; pixel_max 2915 add dyd, dyd 2916 psubw m0, m1, m0 2917 movd xm6, dyd 2918 psraw m0, 3 2919 neg dyd 2920 paddw m1, m0 2921 pxor m0, m0 2922 pmaxsw m1, m0 2923 lea r4d, [dyq+(16<<6)+63] ; ypos 2924 pavgw m1, m0 2925 vpbroadcastw m6, xm6 2926 pminsw m1, m4 2927 punpckhwd m0, m1, m2 2928 punpcklwd m1, m2 2929 vextracti128 [rsp+48], m0, 1 2930 vextracti128 [rsp+32], m1, 1 2931 paddw m7, m6, m6 2932 mova [rsp+16], xm0 2933 mova [rsp+ 0], xm1 2934 punpcklwd m6, m7 ; ypos0 ypos1 2935.h8_upsample_loop: 2936 lea r2d, [r4+dyq] 2937 shr r4d, 6 ; base0 2938 movu m1, [rsp+r4*2] 2939 lea r4d, [r2+dyq] 2940 shr r2d, 6 ; base1 2941 movu m2, [rsp+r2*2] 2942 lea r2d, [r4+dyq] 2943 shr r4d, 6 ; base2 2944 movu m3, [rsp+r4*2] 2945 lea r4d, [r2+dyq] 2946 shr r2d, 6 ; base3 2947 movu m4, [rsp+r2*2] 2948 psrld m0, m1, 16 2949 pblendw m0, m2, 0xaa ; a7 b7 a6 b6 a5 b5 a4 b4 a3 b3 a2 b2 a1 b1 a0 b0 2950 pslld m2, 16 2951 pblendw m1, m2, 0xaa 2952 psrld m2, m3, 16 2953 pblendw m2, m4, 0xaa ; c7 d7 c6 d6 c5 d5 c4 d4 c3 d3 c2 d2 c1 d1 c0 d0 2954 pslld m4, 16 2955 pblendw m3, m4, 0xaa 2956 pand m4, m5, m6 2957 paddw m6, m7 2958 psllw m4, 9 2959 psubw m1, m0 2960 pmulhrsw m1, m4 2961 pand m4, m5, m6 2962 psllw m4, 9 2963 psubw m3, m2 2964 pmulhrsw m3, m4 2965 paddw m6, m7 2966 lea r2, [dstq+strideq*4] 2967 paddw m1, m0 2968 paddw m3, m2 2969 punpckhdq m0, m1, m3 ; a5 b5 c5 d5 a4 b4 c4 d4 a1 b1 c1 d1 a0 b0 c0 d0 2970 punpckldq m1, m3 
; a7 b7 c7 d7 a6 b6 c6 d6 a3 b3 c3 d3 a2 b2 c2 d2 2971 vextracti128 xm2, m0, 1 2972 vextracti128 xm3, m1, 1 2973 movhps [r2 +strideq*0], xm0 2974 movq [r2 +strideq*1], xm0 2975 movhps [r2 +strideq*2], xm1 2976 movq [r2 +r7 ], xm1 2977 movhps [dstq+strideq*0], xm2 2978 movq [dstq+strideq*1], xm2 2979 movhps [dstq+strideq*2], xm3 2980 movq [dstq+r7 ], xm3 2981 add dstq, 8 2982 sub wd, 4 2983 jg .h8_upsample_loop 2984 RET 2985.h8_no_intra_edge_filter: 2986 and maxbased, 7 2987 or maxbased, 8 ; imin(w+7, 15) 2988 jmp .h8_main 2989.h8_no_upsample: 2990 lea maxbased, [wq+7] 2991 test angled, 0x400 2992 jnz .h8_no_intra_edge_filter 2993 call .filter_strength 2994 test r5d, r5d 2995 jz .h8_main 2996 popcnt r5d, r5d 2997 mova m0, [tlq-30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 2998 movu m3, [tlq-28] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 2999 vpbroadcastd m2, [base+z_filter_k-4+r5*4+12*1] 3000 vpbroadcastd m4, [base+z_filter_k-4+r5*4+12*0] 3001 pmullw m2, m0 3002 cmp wd, 8 3003 jl .h8_filter_w4 3004 punpcklwd xm0, xm0 3005 vpblendd m1, m0, [tlq-32], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e 3006 movd [rsp+28], xm0 3007 paddw m1, m3 3008 mov r4d, 16 3009 pmullw m1, m4 3010 cmovg maxbased, r4d 3011 cmp r5d, 3 3012 jne .h8_filter_3tap 3013 punpckhwd m3, m3 3014 vpblendd m0, [tlq-34], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d 3015 vpblendd m3, [tlq-26], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g 3016 movzx r4d, word [tlq-30] 3017 movzx r2d, word [tlq-28] 3018 inc maxbased 3019 paddw m1, m2 3020 paddw m0, m3 3021 sub r2d, r4d 3022 paddw m2, m0, m0 3023 lea r2d, [r2+r4*8+4] 3024 shr r2d, 3 3025 mov [rsp+30], r2w 3026 jmp .h8_filter_3tap 3027.h8_filter_w4: 3028 pshufhw xm1, xm0, q2100 3029 vinserti128 m1, [tlq-16], 1 ; _ _ _ _ 4 4 5 6 7 8 9 a b c d e 3030 paddw m1, m3 3031 pmullw m1, m4 3032.h8_filter_3tap: 3033 pxor m0, m0 3034 paddw m1, m2 3035 lea tlq, [rsp+62] 3036 psrlw m1, 3 3037 pavgw m0, m1 3038 mova [rsp+32], m0 3039.h8_main: 3040 movd xm4, dyd 3041 neg maxbaseq 3042 vbroadcasti128 m1, 
[z_base_inc] 3043 vpbroadcastw m7, [tlq+maxbaseq*2] 3044 shl maxbased, 6 3045 vpbroadcastw m4, xm4 3046 lea r4d, [maxbaseq+7*64] 3047 neg dyq 3048 movd xm2, r4d 3049 sub tlq, 16 3050 lea r4, [dyq+63] 3051 paddw m6, m4, m4 3052 vpbroadcastw m2, xm2 3053 vpblendd m4, m6, 0xf0 ; ypos0 ypos1 3054 psubw m2, m1 3055 or maxbased, 63 3056 paddw m4, m2 3057.h8_loop: 3058 lea r5, [r4+dyq] 3059 sar r4, 6 ; base0 3060 movu xm0, [tlq+r4*2+2] 3061 movu xm1, [tlq+r4*2] 3062 lea r4, [r5+dyq] 3063 sar r5, 6 ; base1 3064 vinserti128 m0, [tlq+r5*2+2], 1 3065 vinserti128 m1, [tlq+r5*2], 1 3066 lea r5, [r4+dyq] 3067 sar r4, 6 ; base2 3068 pand m3, m5, m4 3069 psllw m3, 9 3070 psubw m1, m0 3071 pmulhrsw m1, m3 3072 psraw m3, m4, 15 3073 paddw m4, m6 3074 paddw m0, m1 3075 movu xm1, [tlq+r4*2+2] 3076 movu xm2, [tlq+r4*2] 3077 lea r4, [r5+dyq] 3078 sar r5, 6 ; base3 3079 vpblendvb m0, m7, m0, m3 3080 vinserti128 m1, [tlq+r5*2+2], 1 3081 vinserti128 m2, [tlq+r5*2], 1 3082 pand m3, m5, m4 3083 psllw m3, 9 3084 psubw m2, m1 3085 pmulhrsw m2, m3 3086 psraw m3, m4, 15 3087 paddw m4, m6 3088 lea r5, [dstq+strideq*4] 3089 paddw m1, m2 3090 vpblendvb m1, m7, m1, m3 3091 punpckhwd m2, m0, m1 ; a3 c3 a2 c2 a1 c1 a0 c0 b3 d3 b2 d2 b1 d1 b0 d0 3092 vextracti128 xm3, m2, 1 3093 punpcklwd m0, m1 ; a7 c7 a6 c6 a5 c5 a4 c5 b7 d7 b6 d6 b5 d5 b4 d4 3094 punpckhwd xm1, xm2, xm3 ; a1 b1 c1 d1 a0 b0 c0 d0 3095 punpcklwd xm2, xm3 ; a3 b3 c3 d3 a2 b2 c2 d2 3096 vextracti128 xm3, m0, 1 3097 movhps [dstq+strideq*0], xm1 3098 movq [dstq+strideq*1], xm1 3099 movhps [dstq+strideq*2], xm2 3100 movq [dstq+r7 ], xm2 3101 punpckhwd xm1, xm0, xm3 ; a5 b5 c5 d5 a4 b4 c4 d4 3102 punpcklwd xm0, xm3 ; a7 b7 c7 d7 a6 b6 c6 d6 3103 movhps [r5 +strideq*0], xm1 3104 movq [r5 +strideq*1], xm1 3105 movhps [r5 +strideq*2], xm0 3106 movq [r5 +r7 ], xm0 3107 sub wd, 4 3108 jz .h8_end 3109 add dstq, 8 3110 cmp r4d, maxbased 3111 jg .h8_loop 3112 lea r6, [strideq*5] 3113 lea r2, [strideq+r7*2] ; stride*7 3114 test wd, 4 3115 jz 
.h8_end_loop 3116 movq [dstq+strideq*0], xm7 3117 movq [dstq+strideq*1], xm7 3118 movq [dstq+strideq*2], xm7 3119 movq [dstq+r7 ], xm7 3120 movq [dstq+strideq*4], xm7 3121 movq [dstq+r6 ], xm7 3122 movq [dstq+r7*2 ], xm7 3123 movq [dstq+r2 ], xm7 3124 add dstq, 8 3125 sub wd, 4 3126 jz .h8_end 3127.h8_end_loop: 3128 mova [dstq+strideq*0], xm7 3129 mova [dstq+strideq*1], xm7 3130 mova [dstq+strideq*2], xm7 3131 mova [dstq+r7 ], xm7 3132 mova [dstq+strideq*4], xm7 3133 mova [dstq+r6 ], xm7 3134 mova [dstq+r7*2 ], xm7 3135 mova [dstq+r2 ], xm7 3136 add dstq, 16 3137 sub wd, 8 3138 jg .h8_end_loop 3139.h8_end: 3140 RET 3141.h16_no_intra_edge_filter: 3142 and maxbased, 15 3143 or maxbased, 16 ; imin(w+15, 31) 3144 jmp .h16_main 3145ALIGN function_align 3146.h16: 3147 ALLOC_STACK -96, 10 3148 lea maxbased, [wq+15] 3149 lea r7, [strideq*3] 3150 test angled, 0x400 3151 jnz .h16_no_intra_edge_filter 3152 call .filter_strength 3153 test r5d, r5d 3154 jz .h16_main ; filter_strength == 0 3155 popcnt r5d, r5d 3156 movu m0, [tlq-28] ; 3 4 5 6 7 8 9 a b c d e f g h i 3157 paddw m1, m0, [tlq-32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 3158 vpbroadcastd m6, [base+z_filter_k-4+r5*4+12*1] 3159 vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*0] 3160 pmullw m2, m6, [tlq-30] ; 2 3 4 5 6 7 8 9 a b c d e f g h 3161 pmullw m1, m7 3162 paddw m1, m2 3163 cmp wd, 8 3164 jg .h16_filter_w16 3165 mova xm3, [tlq-46] ; 0 1 2 3 4 5 6 7 3166 pmullw xm6, xm3 3167 jl .h16_filter_w4 3168 pblendw xm3, [tlq-48], 0xfe ; 0 0 1 2 3 4 5 6 3169 cmp r5d, 3 3170 jne .h16_filter_w8_3tap 3171 vpblendd xm4, xm3, [tlq-50], 0x0e ; 0 0 0 1 2 3 4 5 3172.h16_filter_w8_5tap: 3173 punpckhwd m0, m0 3174 vpblendd m0, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i 3175 paddw xm4, [tlq-42] ; 2 3 4 5 6 7 8 9 3176 paddw m0, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 3177 paddw xm4, xm4 3178 paddw m0, m0 3179 paddw xm6, xm4 3180 paddw m1, m0 3181.h16_filter_w8_3tap: 3182 paddw xm3, [tlq-44] ; 1 2 3 4 5 6 7 8 3183 pmullw xm3, xm7 3184 
pxor m0, m0 3185 paddw xm3, xm6 3186 psrlw xm3, 3 3187 pavgw xm3, xm0 3188 mova [rsp+48], xm3 3189 jmp .h16_filter_end 3190.h16_filter_w4: 3191 pshufhw xm3, xm3, q2100 ; _ _ _ _ 4 4 5 6 3192 cmp r5d, 3 3193 jne .h16_filter_w8_3tap 3194 pshufhw xm4, xm3, q2100 ; _ _ _ _ 4 4 4 5 3195 jmp .h16_filter_w8_5tap 3196.h16_filter_w16: 3197 mova m3, [tlq-62] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 3198 pmullw m6, m3 3199 punpcklwd xm3, xm3 3200 vpblendd m4, m3, [tlq-64], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e 3201 paddw m4, [tlq-60] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 3202 mov r4d, 32 3203 cmp wd, 16 3204 cmovg maxbased, r4d 3205 movd [rsp+28], xm3 3206 pmullw m4, m7 3207 cmp r5d, 3 3208 jne .h16_filter_w16_3tap 3209 punpckhwd m0, m0 3210 vpblendd m3, [tlq-66], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d 3211 vpblendd m0, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i 3212 paddw m3, [tlq-58] ; 2 3 4 5 6 7 8 9 a b c d e f g h 3213 paddw m0, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 3214 movzx r4d, word [tlq-62] 3215 movzx r2d, word [tlq-60] 3216 or maxbased, 1 3217 paddw m3, m3 3218 sub r2d, r4d 3219 paddw m0, m0 3220 lea r2d, [r2+r4*8+4] 3221 paddw m4, m3 3222 shr r2d, 3 3223 paddw m1, m0 3224 mov [rsp+30], r2w 3225.h16_filter_w16_3tap: 3226 pxor m0, m0 3227 paddw m4, m6 3228 psrlw m4, 3 3229 pavgw m4, m0 3230 mova [rsp+32], m4 3231.h16_filter_end: 3232 psrlw m1, 3 3233 lea tlq, [rsp+94] 3234 pavgw m1, m0 3235 mova [rsp+64], m1 3236.h16_main: 3237 movd xm8, dyd 3238 neg maxbaseq 3239 vpbroadcastw m9, [tlq+maxbaseq*2] 3240 shl maxbased, 6 3241 vpbroadcastw m8, xm8 3242 lea r4d, [maxbaseq+dyq+15*64] 3243 neg dyq 3244 movd xm7, r4d 3245 sub tlq, 32 3246 lea r4, [dyq+63] 3247 vpbroadcastw m7, xm7 3248 or maxbased, 63 3249 psubw m7, [z_base_inc] 3250.h16_loop: 3251 lea r5, [r4+dyq] 3252 sar r4, 6 ; base0 3253 movu m0, [tlq+r4*2+2] 3254 movu m2, [tlq+r4*2] 3255 lea r4, [r5+dyq] 3256 sar r5, 6 ; base1 3257 movu m1, [tlq+r5*2+2] 3258 movu m3, [tlq+r5*2] 3259 lea r5, [r4+dyq] 3260 sar r4, 6 ; 
base3 3261 pand m6, m5, m7 3262 psllw m6, 9 3263 psubw m2, m0 3264 pmulhrsw m2, m6 3265 psraw m6, m7, 15 3266 paddw m7, m8 3267 paddw m0, m2 3268 movu m2, [tlq+r4*2+2] 3269 movu m4, [tlq+r4*2] 3270 lea r4, [r5+dyq] 3271 sar r5, 6 ; base3 3272 vpblendvb m0, m9, m0, m6 3273 pand m6, m5, m7 3274 psllw m6, 9 3275 psubw m3, m1 3276 pmulhrsw m3, m6 3277 psraw m6, m7, 15 3278 paddw m7, m8 3279 paddw m1, m3 3280 vpblendvb m1, m9, m1, m6 3281 pand m6, m5, m7 3282 psllw m6, 9 3283 psubw m4, m2 3284 pmulhrsw m4, m6 3285 psraw m6, m7, 15 3286 paddw m7, m8 3287 paddw m2, m4 3288 movu m3, [tlq+r5*2+2] 3289 movu m4, [tlq+r5*2] 3290 vpblendvb m2, m9, m2, m6 3291 pand m6, m5, m7 3292 psllw m6, 9 3293 psubw m4, m3 3294 pmulhrsw m4, m6 3295 psraw m6, m7, 15 3296 paddw m7, m8 3297 lea r5, [dstq+strideq*4] 3298 paddw m3, m4 3299 vpblendvb m3, m9, m3, m6 3300 punpckhwd m4, m0, m1 ; ab bb aa ba a9 b9 a8 b8 a3 b3 a2 b2 a1 b1 a0 b0 3301 punpcklwd m0, m1 ; af bf ae be ad bd ac bc a7 b7 a6 b6 a5 b5 a4 b4 3302 punpckhwd m1, m2, m3 ; cb db ca da c9 d9 c8 d8 c3 d3 c2 d2 c1 d1 c0 d0 3303 punpcklwd m2, m3 ; cf df ce de cd dd cc dc c7 d7 c6 d6 c5 d5 c4 d4 3304 punpckhdq m3, m4, m1 ; a9 b9 c9 d9 a8 b8 c8 d8 a1 b1 c1 d1 a0 b0 c0 d0 3305 vextracti128 xm6, m3, 1 3306 punpckldq m4, m1 ; ab bb cb db aa ba ca da a3 b3 c3 d3 a2 b2 c2 d2 3307 punpckhdq m1, m0, m2 ; ad bd cd dd ac bc cc dc a5 b5 c5 d5 a4 b4 c4 d4 3308 punpckldq m0, m2 ; af bf cf df ae be ce de a7 b7 c7 d7 a6 b6 c6 d6 3309 vextracti128 xm2, m4, 1 3310 movhps [dstq+strideq*0], xm6 3311 movq [dstq+strideq*1], xm6 3312 vextracti128 xm6, m1, 1 3313 movhps [dstq+strideq*2], xm2 3314 movq [dstq+r7 ], xm2 3315 vextracti128 xm2, m0, 1 3316 movhps [r5 +strideq*0], xm6 3317 movq [r5 +strideq*1], xm6 3318 movhps [r5 +strideq*2], xm2 3319 movq [r5 +r7 ], xm2 3320 lea r5, [dstq+strideq*8] 3321 movhps [r5 +strideq*0], xm3 3322 movq [r5 +strideq*1], xm3 3323 movhps [r5 +strideq*2], xm4 3324 movq [r5 +r7 ], xm4 3325 lea r5, [r5+strideq*4] 3326 movhps [r5 
+strideq*0], xm1 3327 movq [r5 +strideq*1], xm1 3328 movhps [r5 +strideq*2], xm0 3329 movq [r5 +r7 ], xm0 3330 sub wd, 4 3331 jz .h16_end 3332 add dstq, 8 3333 cmp r4d, maxbased 3334 jg .h16_loop 3335 mov hd, 4 3336.h16_end_loop0: 3337 mov r6d, wd 3338 mov r2, dstq 3339 test wb, 4 3340 jz .h16_end_loop 3341 movq [dstq+strideq*0], xm9 3342 movq [dstq+strideq*1], xm9 3343 movq [dstq+strideq*2], xm9 3344 movq [dstq+r7 ], xm9 3345 and r6d, 120 3346 jz .h16_end_w4 3347 add dstq, 8 3348.h16_end_loop: 3349 mova [dstq+strideq*0], xm9 3350 mova [dstq+strideq*1], xm9 3351 mova [dstq+strideq*2], xm9 3352 mova [dstq+r7 ], xm9 3353 add dstq, 16 3354 sub r6d, 8 3355 jg .h16_end_loop 3356.h16_end_w4: 3357 lea dstq, [r2+strideq*4] 3358 dec hd 3359 jg .h16_end_loop0 3360.h16_end: 3361 RET 3362.h32: 3363 ALLOC_STACK -160, 9 3364 lea maxbased, [wq+31] 3365 and maxbased, 31 3366 or maxbased, 32 ; imin(w+31, 63) 3367 test angled, 0x400 3368 jnz .h32_main 3369 vpbroadcastd m2, [pw_3] 3370 movu m0, [tlq-28] ; 3 4 5 6 7 8 9 a b c d e f g h i 3371 punpckhwd m1, m0, m0 3372 vpblendd m1, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i 3373 paddw m0, [tlq-30] ; 2 3 4 5 6 7 8 9 a b c d e f g h 3374 paddw m1, m2 3375 paddw m0, [tlq-32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 3376 pavgw m1, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 3377 lea r4, [rsp+128] 3378 paddw m0, m1 3379 lea r5d, [maxbaseq-31] 3380 psrlw m0, 2 3381 mova [r4], m0 3382.h32_filter_loop: 3383 mova m0, [tlq-62] 3384 paddw m1, m2, [tlq-66] 3385 paddw m0, [tlq-64] 3386 pavgw m1, [tlq-58] 3387 paddw m0, [tlq-60] 3388 sub tlq, 32 3389 sub r4, 32 3390 paddw m0, m1 3391 psrlw m0, 2 3392 mova [r4], m0 3393 sub r5d, 16 3394 jg .h32_filter_loop 3395 jl .h32_filter_h8 3396 mova m0, [tlq-62] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 3397 punpcklwd xm1, xm0, xm0 3398 paddw m2, [tlq-58] ; 2 3 4 5 6 7 8 9 a b c d e f g h 3399 paddw m0, [tlq-60] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 3400 vpblendd m3, m1, [tlq-66], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d 3401 
vpblendd m1, [tlq-64], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e 3402 movzx r5d, word [tlq-62] 3403 movzx r2d, word [tlq-60] 3404 pavgw m2, m3 3405 sub r2d, r5d 3406 paddw m0, m1 3407 lea r2d, [r2+r5*8+4] 3408 paddw m0, m2 3409 shr r2d, 3 3410 psrlw m0, 2 3411 mova [r4-32], m0 3412 mov [r4-36], r5w 3413 mov [r4-34], r2w 3414 lea tlq, [rsp+158] 3415 mov r4d, 65 3416 cmp wd, 64 3417 cmove maxbased, r4d 3418 jmp .h32_main 3419.h32_filter_h8: 3420 mova xm0, [tlq-46] ; 0 1 2 3 4 5 6 7 3421 pblendw xm1, xm0, [tlq-48], 0xfe ; 0 0 1 2 3 4 5 6 3422 paddw xm2, [tlq-42] ; 2 3 4 5 6 7 8 9 3423 paddw xm0, [tlq-44] ; 1 2 3 4 5 6 7 8 3424 vpblendd xm3, xm1, [tlq-50], 0x0e ; 0 0 0 1 2 3 4 5 3425 lea tlq, [rsp+158] 3426 pavgw xm2, xm3 3427 paddw xm0, xm1 3428 paddw xm0, xm2 3429 psrlw xm0, 2 3430 mova [r4-16], xm0 3431.h32_main: 3432 movd xm6, dyd 3433 neg maxbaseq 3434 vpbroadcastw m7, [tlq+maxbaseq*2] 3435 shl maxbased, 6 3436 vpbroadcastw m6, xm6 3437 lea r4d, [maxbaseq+dyq+15*64] 3438 neg dyq 3439 movd xm4, r4d 3440 vpbroadcastd m8, [pw_m1024] 3441 lea r4, [dyq+63] 3442 vpbroadcastw m4, xm4 3443 or maxbased, 63 3444 psubw m4, [z_base_inc] 3445.h32_loop: 3446 mov r5, r4 3447 sar r5, 6 3448 movu m1, [tlq+r5*2-64] 3449 movu m0, [tlq+r5*2-62] 3450 pand m3, m5, m4 3451 psllw m3, 9 3452 psubw m1, m0 3453 pmulhrsw m1, m3 3454 pcmpgtw m2, m8, m4 3455 paddw m0, m1 3456 vpblendvb m0, m7, m0, m2 3457 movu m2, [tlq+r5*2-32] 3458 movu m1, [tlq+r5*2-30] 3459 add r4, dyq 3460 sub rsp, 64 3461 psubw m2, m1 3462 pmulhrsw m2, m3 3463 psraw m3, m4, 15 3464 paddw m4, m6 3465 mova [rsp+32*0], m0 3466 paddw m1, m2 3467 vpblendvb m1, m7, m1, m3 3468 mova [rsp+32*1], m1 3469 dec wd 3470 jz .h32_transpose 3471 cmp r4d, maxbased 3472 jg .h32_loop 3473.h32_end_loop: 3474 sub rsp, 64 3475 mova [rsp+32*0], m7 3476 mova [rsp+32*1], m7 3477 dec wd 3478 jg .h32_end_loop 3479.h32_transpose: 3480 lea r3, [strideq*3] 3481 lea r4, [strideq*5] 3482 mov r8, dstq 3483 lea r5, [strideq+r3*2] 3484.h32_transpose_loop0: 
3485 lea r6, [rsp+32] 3486 lea r2, [r8+org_wq*2-16] 3487.h32_transpose_loop: 3488 mova m0, [r6+64*7] 3489 mova m1, [r6+64*6] 3490 mova m2, [r6+64*5] 3491 mova m3, [r6+64*4] 3492 mova m4, [r6+64*3] 3493 mova m5, [r6+64*2] 3494 mova m6, [r6+64*1] 3495 mova m7, [r6+64*0] 3496 punpckhwd m8, m0, m1 ; a3 b3 a2 b2 a1 b1 a0 b0 3497 punpcklwd m0, m1 ; a7 b7 a6 b6 a5 b5 a4 b4 3498 punpckhwd m1, m2, m3 ; c3 d3 c2 d2 c1 d1 c0 d0 3499 punpcklwd m2, m3 ; c7 d7 c6 d6 c5 d5 c4 d4 3500 punpckhwd m3, m4, m5 ; e3 f3 e2 f2 e1 f1 e0 f0 3501 punpcklwd m4, m5 ; e7 f7 e6 f6 e5 f5 e4 f4 3502 punpckhwd m5, m6, m7 ; g3 h3 g2 h2 g1 h1 g0 h0 3503 punpcklwd m6, m7 ; g7 h7 g6 h6 g5 h5 g4 h4 3504 lea dstq, [r2+strideq*8] 3505 sub r6, 32 3506 punpckhdq m7, m8, m1 ; a1 b1 c1 d1 a0 b0 c0 d0 3507 punpckldq m8, m1 ; a3 b3 c3 d3 a2 b2 c2 d2 3508 punpckhdq m1, m3, m5 ; e1 f1 g1 h1 e0 f0 g0 h0 3509 punpckldq m3, m5 ; e3 f3 g3 h3 e2 f2 g2 h2 3510 punpckhqdq m5, m7, m1 ; 8 0 3511 vextracti128 [r2 +strideq*0], m5, 1 3512 punpcklqdq m7, m1 ; 9 1 3513 mova [dstq+strideq*0], xm5 3514 punpckhqdq m1, m8, m3 ; 10 2 3515 vextracti128 [r2 +strideq*1], m7, 1 3516 punpcklqdq m8, m3 ; 11 3 3517 mova [dstq+strideq*1], xm7 3518 punpckhdq m3, m0, m2 ; a5 b5 c5 d5 a4 b4 c4 d4 3519 vextracti128 [r2 +strideq*2], m1, 1 3520 punpckldq m0, m2 ; a7 b7 c7 d7 a6 b6 c6 d6 3521 mova [dstq+strideq*2], xm1 3522 punpckhdq m2, m4, m6 ; e5 f5 g5 h5 e4 f4 g4 h4 3523 vextracti128 [r2 +r3 ], m8, 1 3524 punpckldq m4, m6 ; e7 f7 g7 h7 e6 f6 g6 h6 3525 mova [dstq+r3 ], xm8 3526 punpckhqdq m6, m3, m2 ; 12 4 3527 vextracti128 [r2 +strideq*4], m6, 1 3528 punpcklqdq m3, m2 ; 13 5 3529 mova [dstq+strideq*4], xm6 3530 punpckhqdq m2, m0, m4 ; 14 6 3531 vextracti128 [r2 +r4 ], m3, 1 3532 punpcklqdq m0, m4 ; 15 7 3533 mova [dstq+r4 ], xm3 3534 vextracti128 [r2 +r3*2 ], m2, 1 3535 mova [dstq+r3*2 ], xm2 3536 vextracti128 [r2 +r5 ], m0, 1 3537 mova [dstq+r5 ], xm0 3538 lea r2, [dstq+strideq*8] 3539 cmp r6, rsp 3540 jae .h32_transpose_loop 3541 add rsp, 
64*8 3542 sub org_wd, 8 3543 jg .h32_transpose_loop0 3544.h32_end: 3545 RET 3546.h64: 3547 ALLOC_STACK -256, 10 3548 lea maxbased, [wq+63] 3549 test angled, 0x400 3550 jnz .h64_main 3551 vpbroadcastd m2, [pw_3] 3552 movu m0, [tlq-28] ; 3 4 5 6 7 8 9 a b c d e f g h i 3553 punpckhwd m1, m0, m0 3554 vpblendd m1, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i 3555 paddw m0, [tlq-30] ; 2 3 4 5 6 7 8 9 a b c d e f g h 3556 paddw m1, m2 3557 paddw m0, [tlq-32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 3558 pavgw m1, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 3559 lea r4, [rsp+224] 3560 paddw m0, m1 3561 lea r5d, [wq+32] 3562 psrlw m0, 2 3563 mova [r4], m0 3564.h64_filter_loop: 3565 mova m0, [tlq-62] 3566 paddw m1, m2, [tlq-66] 3567 paddw m0, [tlq-64] 3568 pavgw m1, [tlq-58] 3569 paddw m0, [tlq-60] 3570 sub tlq, 32 3571 sub r4, 32 3572 paddw m0, m1 3573 psrlw m0, 2 3574 mova [r4], m0 3575 sub r5d, 16 3576 jg .h64_filter_loop 3577 mova m0, [tlq-62] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 3578 punpcklwd xm1, xm0, xm0 3579 paddw m2, [tlq-58] ; 2 3 4 5 6 7 8 9 a b c d e f g h 3580 paddw m0, [tlq-60] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 3581 vpblendd m3, m1, [tlq-66], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d 3582 vpblendd m1, [tlq-64], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e 3583 lea tlq, [rsp+254] 3584 pavgw m2, m3 3585 paddw m0, m1 3586 paddw m0, m2 3587 psrlw m0, 2 3588 mova [r4-32], m0 3589.h64_main: 3590 neg maxbaseq 3591 movd xm4, dyd 3592 vpbroadcastw m6, [tlq+maxbaseq*2] 3593 shl maxbased, 6 3594 vpbroadcastw m4, xm4 3595 lea r4d, [maxbaseq+dyq+15*64] 3596 neg dyq 3597 vpbroadcastd m7, [pw_m1024] 3598 movd xm3, r4d 3599 lea r4, [dyq+63] 3600 paddw m8, m7, m7 3601 vpbroadcastw m3, xm3 3602 or maxbased, 63 3603 paddw m9, m8, m7 3604 psubw m3, [z_base_inc] 3605.h64_loop: 3606 mov r5, r4 3607 sar r5, 6 3608 movu m1, [tlq+r5*2-128] 3609 movu m0, [tlq+r5*2-126] 3610 pand m2, m5, m3 3611 psllw m2, 9 3612 psubw m1, m0 3613 pmulhrsw m1, m2 3614 sub rsp, 128 3615 paddw m0, m1 3616 pcmpgtw m1, m9, 
m3 3617 vpblendvb m0, m6, m0, m1 3618 mova [rsp+32*0], m0 3619 movu m1, [tlq+r5*2-96] 3620 movu m0, [tlq+r5*2-94] 3621 psubw m1, m0 3622 pmulhrsw m1, m2 3623 paddw m0, m1 3624 pcmpgtw m1, m8, m3 3625 vpblendvb m0, m6, m0, m1 3626 mova [rsp+32*1], m0 3627 movu m1, [tlq+r5*2-64] 3628 movu m0, [tlq+r5*2-62] 3629 psubw m1, m0 3630 pmulhrsw m1, m2 3631 paddw m0, m1 3632 pcmpgtw m1, m7, m3 3633 vpblendvb m0, m6, m0, m1 3634 mova [rsp+32*2], m0 3635 movu m1, [tlq+r5*2-32] 3636 movu m0, [tlq+r5*2-30] 3637 psubw m1, m0 3638 pmulhrsw m1, m2 3639 add r4, dyq 3640 psraw m2, m3, 15 3641 paddw m3, m4 3642 paddw m0, m1 3643 vpblendvb m0, m6, m0, m2 3644 mova [rsp+32*3], m0 3645 dec wd 3646 jz .h64_transpose 3647 cmp r4d, maxbased 3648 jg .h64_loop 3649.h64_end_loop: 3650 sub rsp, 128 3651 mova [rsp+32*0], m6 3652 mova [rsp+32*1], m6 3653 mova [rsp+32*2], m6 3654 mova [rsp+32*3], m6 3655 dec wd 3656 jg .h64_end_loop 3657.h64_transpose: 3658 lea r2, [strideq*3] 3659 lea r3, [strideq*5] 3660 mov r5, dstq 3661 lea r4, [strideq+r2*2] 3662.h64_transpose_loop0: 3663 lea r6, [rsp+112] 3664 lea dstq, [r5+org_wq*2-32] 3665.h64_transpose_loop: 3666 mova xm0, [r6+128*15] 3667 vinserti128 m0, [r6+128* 7], 1 3668 mova xm1, [r6+128*14] 3669 vinserti128 m1, [r6+128* 6], 1 3670 mova xm2, [r6+128*13] 3671 vinserti128 m2, [r6+128* 5], 1 3672 mova xm3, [r6+128*12] 3673 vinserti128 m3, [r6+128* 4], 1 3674 mova xm4, [r6+128*11] 3675 vinserti128 m4, [r6+128* 3], 1 3676 mova xm5, [r6+128*10] 3677 vinserti128 m5, [r6+128* 2], 1 3678 mova xm6, [r6+128* 9] 3679 vinserti128 m6, [r6+128* 1], 1 3680 mova xm7, [r6+128* 8] 3681 vinserti128 m7, [r6+128* 0], 1 3682 punpckhwd m8, m0, m1 3683 punpcklwd m0, m1 3684 punpckhwd m1, m2, m3 3685 punpcklwd m2, m3 3686 punpckhwd m3, m4, m5 3687 punpcklwd m4, m5 3688 punpckhwd m5, m6, m7 3689 punpcklwd m6, m7 3690 sub r6, 16 3691 punpckhdq m7, m8, m1 3692 punpckldq m8, m1 3693 punpckhdq m1, m3, m5 3694 punpckldq m3, m5 3695 punpckhqdq m5, m7, m1 3696 punpcklqdq m7, m1 3697 
    ; (continuation of ipred_z3 .h64_transpose_loop: finish the 16x16 word
    ; transpose with dword/qword interleaves, storing rows as they complete)
    punpckhqdq          m1, m8, m3
    punpcklqdq          m8, m3
    punpckhdq           m3, m0, m2
    mova [dstq+strideq*0], m5
    punpckldq           m0, m2
    mova [dstq+strideq*1], m7
    punpckhdq           m2, m4, m6
    mova [dstq+strideq*2], m1
    punpckldq           m4, m6
    mova [dstq+r2       ], m8
    punpckhqdq          m6, m3, m2
    mova [dstq+strideq*4], m6
    punpcklqdq          m3, m2
    mova [dstq+r3       ], m3
    punpckhqdq          m2, m0, m4
    mova [dstq+r2*2     ], m2
    punpcklqdq          m0, m4
    mova [dstq+r4       ], m0
    lea                 dstq, [dstq+strideq*8]
    cmp                 r6, rsp             ; consumed the whole scratch column?
    jae .h64_transpose_loop
    add                 rsp, 128*16         ; release the column scratch buffer
    sub                 org_wd, 16
    jg .h64_transpose_loop0
.h64_end:
    RET

;-----------------------------------------------------------------------------
; FILTER_1BLK dst, src, tmp, shuf, bdmax
; Compute one 4x2 output block of the recursive filter intra predictor.
; Shuffles the 7 context pixels in xm<src> via <shuf> (either an m-register
; number or a memory operand — hence the %ifnum), broadcasts them to both
; 128-bit lanes, then accumulates four pmaddwd dot products against the
; filter taps held in m2..m5, adds the rounding bias in m1 (pd_8 at the call
; sites in this file), shifts right by 4, packs to unsigned words and clamps
; to m<bdmax> (bitdepth_max). Result in m<dst>; m<src>/m<tmp> are clobbered.
;-----------------------------------------------------------------------------
%macro FILTER_1BLK 5 ; dst, src, tmp, shuf, bdmax
%ifnum %4
    pshufb              xm%2, xm%4
%else
    pshufb              xm%2, %4
%endif
    vinserti128         m%2, xm%2, 1        ; same context in both lanes
    pshufd              m%1, m%2, q0000
    pmaddwd             m%1, m2             ; taps 0/1
    pshufd              m%3, m%2, q1111
    pmaddwd             m%3, m3             ; taps 2/3
    paddd               m%1, m1             ; + rounding bias
    paddd               m%1, m%3
    pshufd              m%3, m%2, q2222
    pmaddwd             m%3, m4             ; taps 4/5
    paddd               m%1, m%3
    pshufd              m%3, m%2, q3333
    pmaddwd             m%3, m5             ; taps 6/7
    paddd               m%1, m%3
    psrad               m%1, 4
    packusdw            m%1, m%1
    pminsw              m%1, m%5            ; clamp to bitdepth_max
%endmacro

;-----------------------------------------------------------------------------
; FILTER_2BLK dst, src, tmp_dst, tmp_src, tmp, shuf, bdmax
; Same filter as FILTER_1BLK but evaluates two 4x2 blocks at once: the two
; context sets come from the low and high halves of m<src> (split via
; vpermq/vinserti128), are filtered against the shared taps m2..m5 with bias
; m1, and the two packed results are merged into m<dst> before clamping.
; m<src>, m<tmp_dst>, m<tmp_src> and m<tmp> are clobbered.
;-----------------------------------------------------------------------------
%macro FILTER_2BLK 7 ; dst, src, tmp_dst, tmp_src, tmp, shuf, bdmax
    pshufb              m%2, m%6
    vpermq              m%4, m%2, q3232     ; second context set
    vinserti128         m%2, xm%2, 1        ; first context set in both lanes
    pshufd              m%1, m%2, q0000
    pshufd              m%3, m%4, q0000
    pmaddwd             m%1, m2
    pmaddwd             m%3, m2
    paddd               m%1, m1
    paddd               m%3, m1
    pshufd              m%5, m%2, q1111
    pmaddwd             m%5, m3
    paddd               m%1, m%5
    pshufd              m%5, m%4, q1111
    pmaddwd             m%5, m3
    paddd               m%3, m%5
    pshufd              m%5, m%2, q2222
    pmaddwd             m%5, m4
    paddd               m%1, m%5
    pshufd              m%5, m%4, q2222
    pmaddwd             m%5, m4
    paddd               m%3, m%5
    pshufd              m%5, m%2, q3333
    pmaddwd             m%5, m5
    paddd               m%1, m%5
    pshufd              m%5, m%4, q3333
    pmaddwd             m%5, m5
    paddd               m%3, m%5
    psrad               m%1, 4
    psrad               m%3, 4
    packusdw            m%1, m%3            ; merge both blocks
    pminsw              m%1, m%7            ; clamp to bitdepth_max
%endmacro

; The ipred_filter SIMD processes
4x2 blocks in the following order which 3783; increases parallelism compared to doing things row by row. One redundant 3784; block is calculated for w8 and w16, two for w32. 3785; w4 w8 w16 w32 3786; 1 1 2 1 2 3 5 1 2 3 5 b c d f 3787; 2 2 3 2 4 5 7 2 4 5 7 c e f h 3788; 3 3 4 4 6 7 9 4 6 7 9 e g h j 3789; ___ 4 ___ 4 5 ___ 6 8 9 a ___ 6 8 9 a g i j k ___ 3790; 5 8 8 i 3791 3792cglobal ipred_filter_16bpc, 3, 9, 0, dst, stride, tl, w, h, filter 3793%define base r6-ipred_filter_16bpc_avx2_table 3794 lea r6, [filter_intra_taps] 3795 tzcnt wd, wm 3796%ifidn filterd, filterm 3797 movzx filterd, filterb 3798%else 3799 movzx filterd, byte filterm 3800%endif 3801 shl filterd, 6 3802 add filterq, r6 3803 lea r6, [ipred_filter_16bpc_avx2_table] 3804 vbroadcasti128 m0, [tlq-6] 3805 movsxd wq, [r6+wq*4] 3806 vpbroadcastd m1, [base+pd_8] 3807 pmovsxbw m2, [filterq+16*0] 3808 pmovsxbw m3, [filterq+16*1] 3809 pmovsxbw m4, [filterq+16*2] 3810 pmovsxbw m5, [filterq+16*3] 3811 add wq, r6 3812 mov hd, hm 3813 jmp wq 3814.w4: 3815 WIN64_SPILL_XMM 10 3816 mova xm8, [base+filter_shuf2] 3817 vpbroadcastw m9, r8m ; bitdepth_max 3818 lea r7, [6+hq*2] 3819 sub tlq, r7 3820 jmp .w4_loop_start 3821.w4_loop: 3822 pinsrq xm0, [tlq+hq*2], 0 3823 lea dstq, [dstq+strideq*2] 3824.w4_loop_start: 3825 FILTER_1BLK 6, 0, 7, 8, 9 3826 vextracti128 xm0, m6, 1 3827 movq [dstq+strideq*0], xm6 3828 movq [dstq+strideq*1], xm0 3829 sub hd, 2 3830 jg .w4_loop 3831 RET 3832ALIGN function_align 3833.w8: 3834 WIN64_SPILL_XMM 16 3835 vbroadcasti128 m14, [base+filter_shuf3] 3836 vpbroadcastw m15, r8m ; bitdepth_max 3837 FILTER_1BLK 10, 0, 7, [base+filter_shuf2], 15 3838 vpermq m6, m10, q1302 ; ____ ____ | ____ 4321 3839 pslldq m8, m0, 4 3840 psrldq m7, m6, 2 3841 psrldq m0, m6, 10 3842 punpcklwd m7, m0 3843 vpblendd m8, m6, 0x33 ; _0__ 4321 | ____ 4321 3844 vpblendd m8, m7, 0x40 ; _056 4321 | ____ 4321 3845 vpblendd m8, [tlq-6], 0x30 ; _056 4321 | ____ 4321 3846 lea r7, [16+hq*2] 3847 sub tlq, r7 3848 jmp 
.w8_loop_start 3849.w8_loop: 3850 vpermq m8, m9, q1302 ; ____ 4321 | ____ 4321 3851 vpermq m6, m9, q2031 3852 psrldq m0, m6, 2 3853 psrldq m6, 10 3854 punpcklwd m6, m0 3855 vpblendd m8, m7, 0x80 ; _0__ 4321 | ____ 4321 3856 vpblendd m8, m6, 0x40 ; _056 4321 | ____ 4321 3857 mova m10, m9 3858.w8_loop_start: 3859 vpblendd m8, [tlq+hq*2], 0x0C ; _056 4321 | _056 4321 3860 call .main 3861 vpblendd m10, m9, 0xCC 3862 mova [dstq+strideq*0], xm10 3863 vextracti128 [dstq+strideq*1], m10, 1 3864 lea dstq, [dstq+strideq*2] 3865 sub hd, 2 3866 jg .w8_loop 3867 RET 3868ALIGN function_align 3869.w16: 3870 ALLOC_STACK 32, 16 3871 vpbroadcastw m15, r8m ; bitdepth_max 3872 sub hd, 2 3873 TAIL_CALL .w16_main, 0 3874.w16_main: 3875 mova xm10, [base+filter_shuf2] 3876 FILTER_1BLK 13, 0, 6, 10, 15 3877 vpermq m12, m13, q3120 3878 mova xm14, [base+filter_shuf3] 3879 vinserti128 m14, [base+filter_shuf1], 1 3880 vpbroadcastq m0, [tlq+10] 3881 vpblendd m0, [tlq-16], 0x4C ; ___0 4321 | _056 ____ 3882 psrldq m6, m12, 8 3883 vpblendd m0, m6, 0x03 ; ___0 4321 | _056 4321 3884 punpcklwd m6, m12 3885 vpblendd m0, m6, 0x80 ; 56_0 4321 | _056 4321 3886 FILTER_2BLK 12, 0, 6, 7, 8, 14, 15 3887 vpblendd m13, m12, 0xCC 3888 vpermq m12, m12, q2031 ; 6___ 5___ 3889 psrldq xm6, xm12, 2 3890 psrldq xm8, xm12, 12 3891 vpblendd xm6, xm8, 0x01 3892 pblendw xm6, [tlq+10], 0xF8 ; 4321 056_ 3893 FILTER_1BLK 11, 6, 8, 10, 15 3894 vpermq m11, m11, q3120 3895 pshufd m9, m11, q1032 3896 movu m8, [tlq+6] ; __43 210_ | ____ ____ 3897 pshufd m8, m8, q3021 ; __0_ 4321 | ____ ____ 3898 pshufhw m8, m8, q3201 ; ___0 4321 | ____ ____ 3899 vpblendd m9, m8, 0x70 ; ___0 4321 | ____ 4321 3900 mova [dstq+strideq*0], xm13 3901 vextracti128 [dstq+strideq*1], m13, 1 3902 lea r7, [20+hq*2] 3903 sub tlq, r7 3904 vpermq m6, m12, q0123 ; ____ 4321 | ____ 4321 3905 jmp .w16_loop_start 3906.w16_loop: 3907 vpermq m13, m13, q3322 3908 vpermq m11, m9, q2020 3909 vpermq m9, m9, q1302 3910 vpermq m6, m12, q0123 3911 psrldq m7, 4 3912 
vpblendd m13, m10, 0xCC 3913 vpblendd m9, m7, 0x40 3914 mova m0, [rsp+8] 3915 mova [dstq+strideq*0], xm13 3916 vextracti128 [dstq+strideq*1], m13, 1 3917.w16_loop_start: 3918 mova m13, m12 3919 vpblendd m0, [tlq+hq*2], 0x0C 3920 psrldq m7, m12, 8 3921 punpcklwd m7, m12 3922 vpblendd m0, m6, 0x33 ; ___0 4321 | _056 4321 3923 vpblendd m0, m7, 0x80 ; 56_0 4321 | _056 4321 3924 FILTER_2BLK 10, 0, 6, 7, 8, 14, 15 3925 vpermq m12, m10, q2031 3926 mova [rsp+8], m0 3927 psrldq m8, m11, 8 3928 psrldq xm6, xm12, 2 3929 psrldq xm7, xm12, 10 3930 psrldq xm0, xm13, 2 3931 punpcklwd m8, m11 3932 punpcklwd xm7, xm6 3933 vpblendd m8, m9, 0x73 ; 56_0 4321 | ____ 4321 3934 vpblendd m8, m7, 0x04 ; 56_0 4321 | __56 4321 3935 vpblendd m8, m0, 0x08 ; 56_0 4321 | _056 4321 3936 call .main 3937 vpermq m8, m11, q3120 3938 vpblendd m6, m8, m9, 0xCC 3939 mova [dstq+strideq*0+16], xm6 3940 vextracti128 [dstq+strideq*1+16], m6, 1 3941 lea dstq, [dstq+strideq*2] 3942 sub hd, 2 3943 jg .w16_loop 3944 vpermq m8, m9, q3120 3945 vextracti128 xm0, m8, 1 ; 4321 ____ 3946 pshufd xm11, xm11, q1032 3947 vpblendd xm0, xm11, 0x02 ; 4321 0___ 3948 psrldq xm6, xm8, 2 3949 psrldq xm7, xm8, 12 3950 pblendw xm0, xm6, 0x4 ; 4321 05__ 3951 pblendw xm0, xm7, 0x2 ; 4321 056_ 3952 FILTER_1BLK 6, 0, 7, [base+filter_shuf2], 15 3953 vpermq m12, m13, q1302 3954 vpblendd m12, m10, 0xCC 3955 vpblendd m9, m6, 0xCC 3956 mova [dstq+strideq*0+ 0], xm12 3957 mova [dstq+strideq*0+16], xm9 3958 vextracti128 [dstq+strideq*1+ 0], m12, 1 3959 vextracti128 [dstq+strideq*1+16], m9, 1 3960 ret 3961ALIGN function_align 3962.w32: 3963 ALLOC_STACK 64, 16 3964 vpbroadcastw m15, r8m ; bitdepth_max 3965 sub hd, 2 3966 lea r3, [dstq+32] 3967 lea r5d, [hd*2+20] 3968 call .w16_main 3969 mov dstq, r3 3970 lea tlq, [tlq+r5+32] 3971 sub r5d, 20 3972 shr r5d, 1 3973 sub r5d, 2 3974 lea r4, [dstq+strideq*2-2] 3975DEFINE_ARGS dst, stride, tl, stride3, left, h 3976 lea stride3q, [strideq*3] 3977 movu m8, [tlq-6] ; 4321 0___ 3978 mova xm10, 
[base+filter_shuf2] 3979 pinsrw xm0, xm8, [dstq+strideq*0-2], 2 3980 pinsrw xm0, xm0, [dstq+strideq*1-2], 1 ; 4321 056_ 3981 pinsrw xm9, [leftq+strideq*0], 5 3982 pinsrw xm9, [leftq+strideq*1], 4 3983 FILTER_1BLK 13, 0, 6, 10, 15 3984 vpermq m12, m13, q3120 3985 mova xm14, [base+filter_shuf3] 3986 vinserti128 m14, [base+filter_shuf1], 1 3987 psrldq m6, m12, 8 3988 punpcklwd m7, m6, m12 3989 vpblendd m0, m6, 0x03 ; ___0 ____ | _0__ 4321 3990 vpblendd m0, m7, 0x80 ; 56_0 ____ | _0__ 4321 3991 vpblendd m0, m8, 0x30 ; 56_0 4321 | _0__ 4321 3992 vpblendd m0, m9, 0x04 ; 56_0 4321 | _056 4321 3993 FILTER_2BLK 12, 0, 6, 7, 8, 14, 15 3994 vpblendd m13, m12, 0xCC 3995 pinsrw xm9, [leftq+strideq*2], 3 3996 pinsrw xm9, [leftq+stride3q ], 2 3997 lea leftq, [leftq+strideq*4] 3998 pinsrw xm9, [leftq+strideq*0], 1 3999 pinsrw xm9, [leftq+strideq*1], 0 4000 movq [rsp+32], xm9 4001 mov r7d, 1 4002 pslldq m8, m9, 4 4003 vpblendd m0, m8, 0x0C ; ___0 ____ | _056 ____ 4004 vpermq m12, m12, q2031 ; 6___ 5___ 4005 psrldq xm6, xm12, 2 4006 psrldq xm7, xm12, 12 4007 vpblendd xm6, xm7, 0x01 ; ____ _56_ 4008 pblendw xm6, [tlq+10], 0xF8 ; 4321 056_ 4009 FILTER_1BLK 11, 6, 7, 10, 15 4010 vpermq m11, m11, q3120 4011 pshufd m9, m11, q1032 4012 vbroadcasti128 m8, [tlq+22] ; __43 210_ | ____ ____ 4013 pshufd m8, m8, q3021 ; __0_ 4321 | ____ ____ 4014 pshufhw m8, m8, q3201 ; ___0 4321 | ____ ____ 4015 vpblendd m9, m8, 0x70 ; ___0 4321 | ____ 4321 4016 mova [dstq+strideq*0], xm13 4017 vextracti128 [dstq+strideq*1], m13, 1 4018 vpermq m6, m12, q0123 ; ____ 4321 | ____ 4321 4019 jmp .w32_loop_start 4020.w32_loop_last: 4021 mova m0, [rsp+0] 4022 jmp .w32_loop 4023.w32_loop_left: 4024 mova m0, [rsp+0] 4025 vpblendd m0, [rsp+32+r7*4-12], 0x0C 4026 dec r7d 4027 jg .w32_loop 4028 cmp hd, 2 4029 je .w32_loop 4030 pinsrw xm6, [rsp+32], 6 4031 pinsrw xm6, [leftq+strideq*2], 5 4032 pinsrw xm6, [leftq+stride3q ], 4 4033 lea leftq, [leftq+strideq*4] 4034 pinsrw xm6, [leftq+strideq*0], 3 4035 pinsrw xm6, 
[leftq+strideq*1], 2 4036 pinsrw xm6, [leftq+strideq*2], 1 4037 pinsrw xm6, [leftq+stride3q ], 0 4038 lea leftq, [leftq+strideq*4] 4039 movu [rsp+36], xm6 4040 pinsrw xm6, [leftq+strideq*0], 1 4041 pinsrw xm6, [leftq+strideq*1], 0 4042 movd [rsp+32], xm6 4043 mov r7d, 4 4044.w32_loop: 4045 vpermq m13, m13, q3322 4046 vpermq m11, m9, q2020 4047 vpermq m9, m9, q1302 4048 vpermq m6, m12, q0123 4049 psrldq m7, 4 4050 vpblendd m13, m10, 0xCC 4051 vpblendd m9, m7, 0x40 ; ___0 4321 | ____ 4321 4052 mova [dstq+strideq*0], xm13 4053 vextracti128 [dstq+strideq*1], m13, 1 4054.w32_loop_start: 4055 mova m13, m12 4056 psrldq m7, m12, 8 4057 punpcklwd m7, m12 4058 vpblendd m0, m6, 0x33 ; ___0 4321 | _056 4321 4059 vpblendd m0, m7, 0x80 ; 56_0 4321 | _056 4321 4060 FILTER_2BLK 10, 0, 6, 7, 8, 14, 15 4061 vpermq m12, m10, q2031 4062 mova [rsp+0], m0 4063 psrldq m8, m11, 8 4064 psrldq xm6, xm12, 2 4065 psrldq xm7, xm12, 10 4066 psrldq xm0, xm13, 2 4067 punpcklwd m8, m11 4068 punpcklwd xm7, xm6 4069 vpblendd m8, m9, 0x73 ; 56_0 4321 | ____ 4321 4070 vpblendd m8, m7, 0x04 ; 56_0 4321 | __56 4321 4071 vpblendd m8, m0, 0x08 ; 56_0 4321 | _056 4321 4072 call .main 4073 vpermq m8, m11, q3120 4074 vpblendd m6, m8, m9, 0xCC 4075 mova [dstq+strideq*0+16], xm6 4076 vextracti128 [dstq+strideq*1+16], m6, 1 4077 lea dstq, [dstq+strideq*2] 4078 sub hd, 2 4079 jg .w32_loop_left 4080 jz .w32_loop_last 4081 vpermq m8, m9, q3120 4082 vextracti128 xm0, m8, 1 ; 4321 ____ 4083 pshufd xm11, xm11, q1032 4084 vpblendd xm0, xm11, 0x02 ; 4321 0___ 4085 psrldq xm6, xm8, 2 4086 psrldq xm7, xm8, 12 4087 pblendw xm0, xm6, 0x4 ; 4321 05__ 4088 pblendw xm0, xm7, 0x2 ; 4321 056_ 4089 FILTER_1BLK 6, 0, 7, [base+filter_shuf2], 15 4090 vpermq m12, m13, q1302 4091 vpblendd m12, m10, 0xCC 4092 vpblendd m9, m6, 0xCC 4093 mova [dstq+strideq*0+ 0], xm12 4094 mova [dstq+strideq*0+16], xm9 4095 vextracti128 [dstq+strideq*1+ 0], m12, 1 4096 vextracti128 [dstq+strideq*1+16], m9, 1 4097 RET 4098.main: 4099 FILTER_2BLK 9, 8, 6, 
7, 0, 14, 15 4100 ret 4101 4102%if WIN64 4103DECLARE_REG_TMP 5 4104%else 4105DECLARE_REG_TMP 7 4106%endif 4107 4108%macro IPRED_CFL 1 ; ac in, unpacked pixels out 4109 psignw m3, m%1, m1 4110 pabsw m%1, m%1 4111 pmulhrsw m%1, m2 4112 psignw m%1, m3 4113 paddw m%1, m0 4114%endmacro 4115 4116cglobal ipred_cfl_top_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha 4117 movifnidn hd, hm 4118 add tlq, 2 4119 movd xm4, wd 4120 pxor m6, m6 4121 vpbroadcastw m7, r7m 4122 pavgw xm4, xm6 4123 tzcnt wd, wd 4124 movd xm5, wd 4125 movu m0, [tlq] 4126 lea t0, [ipred_cfl_left_16bpc_avx2_table] 4127 movsxd r6, [t0+wq*4] 4128 add r6, t0 4129 add t0, ipred_cfl_splat_16bpc_avx2_table-ipred_cfl_left_16bpc_avx2_table 4130 movsxd wq, [t0+wq*4] 4131 add wq, t0 4132 movifnidn acq, acmp 4133 jmp r6 4134 4135cglobal ipred_cfl_left_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha 4136 mov hd, hm ; zero upper half 4137 sub tlq, hq 4138 movd xm4, hd 4139 sub tlq, hq 4140 pxor m6, m6 4141 vpbroadcastw m7, r7m 4142 pavgw xm4, xm6 4143 tzcnt r6d, hd 4144 movd xm5, r6d 4145 movu m0, [tlq] 4146 lea t0, [ipred_cfl_left_16bpc_avx2_table] 4147 movsxd r6, [t0+r6*4] 4148 add r6, t0 4149 add t0, ipred_cfl_splat_16bpc_avx2_table-ipred_cfl_left_16bpc_avx2_table 4150 tzcnt wd, wd 4151 movsxd wq, [t0+wq*4] 4152 add wq, t0 4153 movifnidn acq, acmp 4154 jmp r6 4155.h32: 4156 paddw m0, [tlq+32] 4157.h16: 4158 vextracti128 xm1, m0, 1 4159 paddw xm0, xm1 4160.h8: 4161 psrldq xm1, xm0, 8 4162 paddw xm0, xm1 4163.h4: 4164 punpcklwd xm0, xm6 4165 psrlq xm1, xm0, 32 4166 paddd xm0, xm1 4167 psrldq xm1, xm0, 8 4168 paddd xm0, xm1 4169 paddd xm0, xm4 4170 psrld xm0, xm5 4171 vpbroadcastw m0, xm0 4172 jmp wq 4173 4174cglobal ipred_cfl_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha 4175 movifnidn hd, hm 4176 movifnidn wd, wm 4177 tzcnt r6d, hd 4178 lea t0d, [wq+hq] 4179 movd xm4, t0d 4180 tzcnt t0d, t0d 4181 movd xm5, t0d 4182 lea t0, [ipred_cfl_16bpc_avx2_table] 4183 tzcnt wd, wd 4184 movsxd r6, [t0+r6*4] 4185 movsxd 
wq, [t0+wq*4+4*4] 4186 psrlw xm4, 1 4187 pxor m6, m6 4188 vpbroadcastw m7, r7m 4189 add r6, t0 4190 add wq, t0 4191 movifnidn acq, acmp 4192 jmp r6 4193.h4: 4194 movq xm0, [tlq-8] 4195 jmp wq 4196.w4: 4197 movq xm1, [tlq+2] 4198 paddw m0, m4 4199 paddw m0, m1 4200 psrlq m1, m0, 32 4201 paddw m0, m1 4202 psrld m1, m0, 16 4203 paddw m0, m1 4204 cmp hd, 4 4205 jg .w4_mul 4206 psrlw xm0, 3 4207 jmp .w4_end 4208.w4_mul: 4209 vextracti128 xm1, m0, 1 4210 paddw xm0, xm1 4211 lea r2d, [hq*2] 4212 mov r6d, 0xAAAB6667 4213 shrx r6d, r6d, r2d 4214 punpckhwd xm1, xm0, xm6 4215 punpcklwd xm0, xm6 4216 paddd xm0, xm1 4217 movd xm1, r6d 4218 psrld xm0, 2 4219 pmulhuw xm0, xm1 4220 psrlw xm0, 1 4221.w4_end: 4222 vpbroadcastw m0, xm0 4223.s4: 4224 vpbroadcastw m1, alpham 4225 lea r6, [strideq*3] 4226 pabsw m2, m1 4227 psllw m2, 9 4228.s4_loop: 4229 mova m4, [acq] 4230 IPRED_CFL 4 4231 pmaxsw m4, m6 4232 pminsw m4, m7 4233 vextracti128 xm5, m4, 1 4234 movq [dstq+strideq*0], xm4 4235 movq [dstq+strideq*2], xm5 4236 movhps [dstq+strideq*1], xm4 4237 movhps [dstq+r6 ], xm5 4238 lea dstq, [dstq+strideq*4] 4239 add acq, 32 4240 sub hd, 4 4241 jg .s4_loop 4242 RET 4243ALIGN function_align 4244.h8: 4245 mova xm0, [tlq-16] 4246 jmp wq 4247.w8: 4248 vextracti128 xm1, m0, 1 4249 paddw xm0, [tlq+2] 4250 paddw xm0, xm4 4251 paddw xm0, xm1 4252 psrld xm1, xm0, 16 4253 paddw xm0, xm1 4254 pblendw xm0, xm6, 0xAA 4255 psrlq xm1, xm0, 32 4256 paddd xm0, xm1 4257 psrldq xm1, xm0, 8 4258 paddd xm0, xm1 4259 psrld xm0, xm5 4260 cmp hd, 8 4261 je .w8_end 4262 mov r6d, 0xAAAB 4263 mov r2d, 0x6667 4264 cmp hd, 32 4265 cmovz r6d, r2d 4266 movd xm1, r6d 4267 pmulhuw xm0, xm1 4268 psrlw xm0, 1 4269.w8_end: 4270 vpbroadcastw m0, xm0 4271.s8: 4272 vpbroadcastw m1, alpham 4273 lea r6, [strideq*3] 4274 pabsw m2, m1 4275 psllw m2, 9 4276.s8_loop: 4277 mova m4, [acq] 4278 mova m5, [acq+32] 4279 IPRED_CFL 4 4280 IPRED_CFL 5 4281 pmaxsw m4, m6 4282 pmaxsw m5, m6 4283 pminsw m4, m7 4284 pminsw m5, m7 4285 mova 
[dstq+strideq*0], xm4 4286 mova [dstq+strideq*2], xm5 4287 vextracti128 [dstq+strideq*1], m4, 1 4288 vextracti128 [dstq+r6 ], m5, 1 4289 lea dstq, [dstq+strideq*4] 4290 add acq, 64 4291 sub hd, 4 4292 jg .s8_loop 4293 RET 4294ALIGN function_align 4295.h16: 4296 mova m0, [tlq-32] 4297 jmp wq 4298.w16: 4299 paddw m0, [tlq+2] 4300 vextracti128 xm1, m0, 1 4301 paddw xm0, xm4 4302 paddw xm0, xm1 4303 punpckhwd xm1, xm0, xm6 4304 punpcklwd xm0, xm6 4305 paddd xm0, xm1 4306 psrlq xm1, xm0, 32 4307 paddd xm0, xm1 4308 psrldq xm1, xm0, 8 4309 paddd xm0, xm1 4310 psrld xm0, xm5 4311 cmp hd, 16 4312 je .w16_end 4313 mov r6d, 0xAAAB 4314 mov r2d, 0x6667 4315 test hb, 8|32 4316 cmovz r6d, r2d 4317 movd xm1, r6d 4318 pmulhuw xm0, xm1 4319 psrlw xm0, 1 4320.w16_end: 4321 vpbroadcastw m0, xm0 4322.s16: 4323 vpbroadcastw m1, alpham 4324 pabsw m2, m1 4325 psllw m2, 9 4326.s16_loop: 4327 mova m4, [acq] 4328 mova m5, [acq+32] 4329 IPRED_CFL 4 4330 IPRED_CFL 5 4331 pmaxsw m4, m6 4332 pmaxsw m5, m6 4333 pminsw m4, m7 4334 pminsw m5, m7 4335 mova [dstq+strideq*0], m4 4336 mova [dstq+strideq*1], m5 4337 lea dstq, [dstq+strideq*2] 4338 add acq, 64 4339 sub hd, 2 4340 jg .s16_loop 4341 RET 4342ALIGN function_align 4343.h32: 4344 mova m0, [tlq-64] 4345 paddw m0, [tlq-32] 4346 jmp wq 4347.w32: 4348 paddw m0, [tlq+ 2] 4349 paddw m0, [tlq+34] 4350 vextracti128 xm1, m0, 1 4351 paddw xm0, xm4 4352 paddw xm0, xm1 4353 punpcklwd xm1, xm0, xm6 4354 punpckhwd xm0, xm6 4355 paddd xm0, xm1 4356 psrlq xm1, xm0, 32 4357 paddd xm0, xm1 4358 psrldq xm1, xm0, 8 4359 paddd xm0, xm1 4360 psrld xm0, xm5 4361 cmp hd, 32 4362 je .w32_end 4363 lea r2d, [hq*2] 4364 mov r6d, 0x6667AAAB 4365 shrx r6d, r6d, r2d 4366 movd xm1, r6d 4367 pmulhuw xm0, xm1 4368 psrlw xm0, 1 4369.w32_end: 4370 vpbroadcastw m0, xm0 4371.s32: 4372 vpbroadcastw m1, alpham 4373 pabsw m2, m1 4374 psllw m2, 9 4375.s32_loop: 4376 mova m4, [acq] 4377 mova m5, [acq+32] 4378 IPRED_CFL 4 4379 IPRED_CFL 5 4380 pmaxsw m4, m6 4381 pmaxsw m5, m6 4382 
pminsw m4, m7 4383 pminsw m5, m7 4384 mova [dstq+32*0], m4 4385 mova [dstq+32*1], m5 4386 add dstq, strideq 4387 add acq, 64 4388 dec hd 4389 jg .s32_loop 4390 RET 4391 4392cglobal ipred_cfl_128_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha 4393 mov r6d, r7m 4394 shr r6d, 11 4395 lea t0, [ipred_cfl_splat_16bpc_avx2_table] 4396 tzcnt wd, wd 4397 movifnidn hd, hm 4398 movsxd wq, [t0+wq*4] 4399 vpbroadcastd m0, [t0-ipred_cfl_splat_16bpc_avx2_table+pw_512+r6*4] 4400 pxor m6, m6 4401 vpbroadcastw m7, r7m 4402 add wq, t0 4403 movifnidn acq, acmp 4404 jmp wq 4405 4406cglobal ipred_cfl_ac_420_16bpc, 4, 7, 6, ac, ypx, stride, wpad, hpad, w, h 4407 movifnidn hpadd, hpadm 4408 vpbroadcastd m5, [pw_2] 4409 mov hd, hm 4410 shl hpadd, 2 4411 pxor m4, m4 4412 sub hd, hpadd 4413 cmp dword wm, 8 4414 jg .w16 4415 je .w8 4416.w4: 4417 lea r3, [strideq*3] 4418 mov r5, acq 4419.w4_loop: 4420 mova xm0, [ypxq+strideq*2] 4421 mova xm1, [ypxq+r3 ] 4422 vinserti128 m0, [ypxq+strideq*0], 1 4423 vinserti128 m1, [ypxq+strideq*1], 1 4424 lea ypxq, [ypxq+strideq*4] 4425 pmaddwd m0, m5 4426 pmaddwd m1, m5 4427 paddd m0, m1 4428 vextracti128 xm1, m0, 1 4429 paddd m4, m0 4430 packssdw xm1, xm0 4431 mova [acq], xm1 4432 add acq, 16 4433 sub hd, 2 4434 jg .w4_loop 4435 test hpadd, hpadd 4436 jz .dc 4437 vpermq m1, m1, q1111 4438 pslld xm0, 2 4439.w4_hpad_loop: 4440 mova [acq], m1 4441 paddd m4, m0 4442 add acq, 32 4443 sub hpadd, 4 4444 jg .w4_hpad_loop 4445 jmp .dc 4446.w8: 4447 mov r5, acq 4448 test wpadd, wpadd 4449 jnz .w8_wpad1 4450.w8_loop: 4451 pmaddwd m0, m5, [ypxq+strideq*0] 4452 pmaddwd m1, m5, [ypxq+strideq*1] 4453 lea ypxq, [ypxq+strideq*2] 4454 paddd m0, m1 4455 vextracti128 xm1, m0, 1 4456 paddd m4, m0 4457 packssdw xm1, xm0, xm1 4458 mova [acq], xm1 4459 add acq, 16 4460 dec hd 4461 jg .w8_loop 4462.w8_hpad: 4463 test hpadd, hpadd 4464 jz .dc 4465 vinserti128 m1, xm1, 1 4466 pslld m0, 2 4467 jmp .hpad 4468.w8_wpad1: 4469 pmaddwd xm0, xm5, [ypxq+strideq*0] 4470 pmaddwd xm3, xm5, 
[ypxq+strideq*1] 4471 lea ypxq, [ypxq+strideq*2] 4472 paddd xm0, xm3 4473 pshufd xm3, xm0, q3333 4474 packssdw xm1, xm0, xm3 4475 paddd xm0, xm3 4476 paddd xm4, xm0 4477 mova [acq], xm1 4478 add acq, 16 4479 dec hd 4480 jg .w8_wpad1 4481 jmp .w8_hpad 4482.w16_wpad: 4483 mova m0, [ypxq+strideq*0+ 0] 4484 mova m1, [ypxq+strideq*1+ 0] 4485 cmp wpadd, 2 4486 jl .w16_wpad1 4487 je .w16_wpad2 4488 vpbroadcastd m2, [ypxq+strideq*0+12] 4489 vpbroadcastd m3, [ypxq+strideq*1+12] 4490 vpblendd m0, m2, 0xf0 4491 vpblendd m1, m3, 0xf0 4492 jmp .w16_wpad_end 4493.w16_wpad2: 4494 vpbroadcastd m2, [ypxq+strideq*0+28] 4495 vpbroadcastd m3, [ypxq+strideq*1+28] 4496 jmp .w16_wpad_end 4497.w16_wpad1: 4498 vpbroadcastd m2, [ypxq+strideq*0+44] 4499 vpbroadcastd m3, [ypxq+strideq*1+44] 4500 vinserti128 m2, [ypxq+strideq*0+32], 0 4501 vinserti128 m3, [ypxq+strideq*1+32], 0 4502.w16_wpad_end: 4503 lea ypxq, [ypxq+strideq*2] 4504 REPX {pmaddwd x, m5}, m0, m1, m2, m3 4505 paddd m0, m1 4506 paddd m2, m3 4507 packssdw m1, m0, m2 4508 paddd m0, m2 4509 vpermq m1, m1, q3120 4510 paddd m4, m0 4511 mova [acq], m1 4512 add acq, 32 4513 dec hd 4514 jg .w16_wpad 4515 jmp .w16_hpad 4516.w16: 4517 mov r5, acq 4518 test wpadd, wpadd 4519 jnz .w16_wpad 4520.w16_loop: 4521 pmaddwd m0, m5, [ypxq+strideq*0+ 0] 4522 pmaddwd m2, m5, [ypxq+strideq*0+32] 4523 pmaddwd m1, m5, [ypxq+strideq*1+ 0] 4524 pmaddwd m3, m5, [ypxq+strideq*1+32] 4525 lea ypxq, [ypxq+strideq*2] 4526 paddd m0, m1 4527 paddd m2, m3 4528 packssdw m1, m0, m2 4529 paddd m0, m2 4530 vpermq m1, m1, q3120 4531 paddd m4, m0 4532 mova [acq], m1 4533 add acq, 32 4534 dec hd 4535 jg .w16_loop 4536.w16_hpad: 4537 add hpadd, hpadd 4538 jz .dc 4539 paddd m0, m0 4540.hpad: 4541 mova [acq+32*0], m1 4542 paddd m4, m0 4543 mova [acq+32*1], m1 4544 add acq, 32*2 4545 sub hpadd, 4 4546 jg .hpad 4547.dc: 4548 vextracti128 xm1, m4, 1 4549 sub r5, acq ; -w*h*2 4550 tzcnt r1d, r5d 4551 paddd xm4, xm1 4552 sub r1d, 2 4553 punpckhqdq xm1, xm4, xm4 4554 movd xm0, r1d 
4555 paddd xm1, xm4 4556 pshuflw xm4, xm1, q1032 4557 paddd xm1, xm4 4558 psrld xm1, xm0 4559 pxor xm0, xm0 4560 pavgw xm1, xm0 4561 vpbroadcastw m1, xm1 4562.dc_loop: 4563 mova m0, [acq+r5] 4564 psubw m0, m1 4565 mova [acq+r5], m0 4566 add r5, 32 4567 jl .dc_loop 4568 RET 4569 4570cglobal ipred_cfl_ac_422_16bpc, 4, 7, 6, ac, ypx, stride, wpad, hpad, w, h 4571 movifnidn hpadd, hpadm 4572 vpbroadcastd m5, [pw_4] 4573 mov hd, hm 4574 shl hpadd, 2 4575 pxor m4, m4 4576 sub hd, hpadd 4577 cmp dword wm, 8 4578 jg .w16 4579 je .w8 4580.w4: 4581 lea r3, [strideq*3] 4582 mov r5, acq 4583.w4_loop: 4584 mova xm0, [ypxq+strideq*0] 4585 mova xm1, [ypxq+strideq*1] 4586 vinserti128 m0, [ypxq+strideq*2], 1 4587 vinserti128 m1, [ypxq+r3 ], 1 4588 lea ypxq, [ypxq+strideq*4] 4589 pmaddwd m0, m5 4590 pmaddwd m1, m5 4591 paddd m4, m0 4592 packssdw m0, m1 4593 paddd m4, m1 4594 mova [acq], m0 4595 add acq, 32 4596 sub hd, 4 4597 jg .w4_loop 4598 test hpadd, hpadd 4599 jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc 4600 vextracti128 xm1, m1, 1 4601 vpermq m0, m0, q3333 4602 pslld xm1, 2 4603.w4_hpad_loop: 4604 mova [acq], m0 4605 paddd m4, m1 4606 add acq, 32 4607 sub hpadd, 4 4608 jg .w4_hpad_loop 4609 jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc 4610.w8: 4611 mov r5, acq 4612 test wpadd, wpadd 4613 jnz .w8_wpad1 4614.w8_loop: 4615 pmaddwd m1, m5, [ypxq+strideq*0] 4616 pmaddwd m0, m5, [ypxq+strideq*1] 4617 lea ypxq, [ypxq+strideq*2] 4618 paddd m4, m1 4619 packssdw m1, m0 4620 paddd m4, m0 4621 vpermq m2, m1, q3120 4622 mova [acq], m2 4623 add acq, 32 4624 sub hd, 2 4625 jg .w8_loop 4626.w8_hpad: 4627 test hpadd, hpadd 4628 jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc 4629 vpermq m1, m1, q3131 4630 pslld m0, 2 4631 jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).hpad 4632.w8_wpad1: 4633 vpbroadcastd m1, [ypxq+strideq*0+12] 4634 vpbroadcastd m0, [ypxq+strideq*1+12] 4635 vinserti128 m1, [ypxq+strideq*0+ 0], 0 4636 vinserti128 m0, 
[ypxq+strideq*1+ 0], 0 4637 lea ypxq, [ypxq+strideq*2] 4638 pmaddwd m1, m5 4639 pmaddwd m0, m5 4640 paddd m4, m1 4641 packssdw m1, m0 4642 paddd m4, m0 4643 vpermq m2, m1, q3120 4644 mova [acq], m2 4645 add acq, 32 4646 sub hd, 2 4647 jg .w8_wpad1 4648 jmp .w8_hpad 4649.w16: 4650 mov r5, acq 4651 test wpadd, wpadd 4652 jnz .w16_wpad 4653.w16_loop: 4654 pmaddwd m2, m5, [ypxq+strideq*0+ 0] 4655 pmaddwd m1, m5, [ypxq+strideq*0+32] 4656 pmaddwd m0, m5, [ypxq+strideq*1+ 0] 4657 pmaddwd m3, m5, [ypxq+strideq*1+32] 4658 lea ypxq, [ypxq+strideq*2] 4659 paddd m4, m2 4660 packssdw m2, m1 4661 paddd m4, m1 4662 packssdw m1, m0, m3 4663 paddd m0, m3 4664 vpermq m2, m2, q3120 4665 paddd m4, m0 4666 vpermq m1, m1, q3120 4667 mova [acq+32*0], m2 4668 mova [acq+32*1], m1 4669 add acq, 32*2 4670 sub hd, 2 4671 jg .w16_loop 4672 jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).w16_hpad 4673.w16_wpad: 4674 mova m2, [ypxq+strideq*0+ 0] 4675 mova m0, [ypxq+strideq*1+ 0] 4676 cmp wpadd, 2 4677 jl .w16_wpad1 4678 je .w16_wpad2 4679 vpbroadcastd m1, [ypxq+strideq*0+12] 4680 vpbroadcastd m3, [ypxq+strideq*1+12] 4681 vpblendd m2, m1, 0xf0 4682 vpblendd m0, m3, 0xf0 4683 jmp .w16_wpad_end 4684.w16_wpad2: 4685 vpbroadcastd m1, [ypxq+strideq*0+28] 4686 vpbroadcastd m3, [ypxq+strideq*1+28] 4687 jmp .w16_wpad_end 4688.w16_wpad1: 4689 vpbroadcastd m1, [ypxq+strideq*0+44] 4690 vpbroadcastd m3, [ypxq+strideq*1+44] 4691 vinserti128 m1, [ypxq+strideq*0+32], 0 4692 vinserti128 m3, [ypxq+strideq*1+32], 0 4693.w16_wpad_end: 4694 lea ypxq, [ypxq+strideq*2] 4695 REPX {pmaddwd x, m5}, m2, m0, m1, m3 4696 paddd m4, m2 4697 packssdw m2, m1 4698 paddd m4, m1 4699 packssdw m1, m0, m3 4700 paddd m0, m3 4701 vpermq m2, m2, q3120 4702 paddd m4, m0 4703 vpermq m1, m1, q3120 4704 mova [acq+32*0], m2 4705 mova [acq+32*1], m1 4706 add acq, 32*2 4707 sub hd, 2 4708 jg .w16_wpad 4709 jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).w16_hpad 4710 4711cglobal ipred_cfl_ac_444_16bpc, 4, 7, 6, ac, ypx, 
stride, wpad, hpad, w, h 4712 lea r6, [ipred_cfl_ac_444_16bpc_avx2_table] 4713 tzcnt wd, wm 4714 movifnidn hpadd, hpadm 4715 vpbroadcastd m5, [pw_1] 4716 movsxd wq, [r6+wq*4] 4717 shl hpadd, 2 4718 add wq, r6 4719 mov hd, hm 4720 pxor m4, m4 4721 sub hd, hpadd 4722 jmp wq 4723.w4: 4724 lea r3, [strideq*3] 4725 mov r5, acq 4726.w4_loop: 4727 movq xm0, [ypxq+strideq*0] 4728 movhps xm0, [ypxq+strideq*1] 4729 vpbroadcastq m1, [ypxq+strideq*2] 4730 vpbroadcastq m2, [ypxq+r3 ] 4731 lea ypxq, [ypxq+strideq*4] 4732 vpblendd m0, m1, 0x30 4733 vpblendd m0, m2, 0xc0 4734 psllw m0, 3 4735 pmaddwd m1, m0, m5 4736 mova [acq], m0 4737 add acq, 32 4738 paddd m4, m1 4739 sub hd, 4 4740 jg .w4_loop 4741 test hpadd, hpadd 4742 jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc 4743 vpermq m0, m0, q3333 4744 paddd m1, m1 4745 mova [acq+32*0], m0 4746 vpermq m1, m1, q3333 4747 mova [acq+32*1], m0 4748 add acq, 32*2 4749 paddd m4, m1 4750 jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc 4751.w8: 4752 lea r3, [strideq*3] 4753 mov r5, acq 4754.w8_loop: 4755 mova xm2, [ypxq+strideq*0] 4756 vinserti128 m2, [ypxq+strideq*1], 1 4757 mova xm1, [ypxq+strideq*2] 4758 vinserti128 m1, [ypxq+r3 ], 1 4759 lea ypxq, [ypxq+strideq*4] 4760 psllw m2, 3 4761 psllw m1, 3 4762 mova [acq+32*0], m2 4763 pmaddwd m2, m5 4764 mova [acq+32*1], m1 4765 pmaddwd m0, m1, m5 4766 add acq, 32*2 4767 paddd m4, m2 4768 paddd m4, m0 4769 sub hd, 4 4770 jg .w8_loop 4771 test hpadd, hpadd 4772 jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc 4773 vperm2i128 m1, m1, 0x11 4774 pslld m0, 2 4775 pxor m2, m2 4776 vpblendd m0, m2, 0x0f 4777 jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).hpad 4778.w16_wpad2: 4779 vpbroadcastw m3, [ypxq+strideq*0+14] 4780 vpbroadcastw m0, [ypxq+strideq*1+14] 4781 vpblendd m2, m3, 0xf0 4782 vpblendd m1, m0, 0xf0 4783 jmp .w16_wpad_end 4784.w16: 4785 mov r5, acq 4786.w16_loop: 4787 mova m2, [ypxq+strideq*0] 4788 mova m1, [ypxq+strideq*1] 4789 test 
wpadd, wpadd 4790 jnz .w16_wpad2 4791.w16_wpad_end: 4792 lea ypxq, [ypxq+strideq*2] 4793 psllw m2, 3 4794 psllw m1, 3 4795 mova [acq+32*0], m2 4796 pmaddwd m2, m5 4797 mova [acq+32*1], m1 4798 pmaddwd m0, m1, m5 4799 add acq, 32*2 4800 paddd m4, m2 4801 paddd m4, m0 4802 sub hd, 2 4803 jg .w16_loop 4804 add hpadd, hpadd 4805 jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc 4806 paddd m0, m0 4807 jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).hpad 4808.w32: 4809 mov r5, acq 4810 test wpadd, wpadd 4811 jnz .w32_wpad 4812.w32_loop: 4813 mova m0, [ypxq+ 0] 4814 mova m1, [ypxq+32] 4815 add ypxq, strideq 4816 psllw m0, 3 4817 psllw m1, 3 4818 pmaddwd m2, m0, m5 4819 mova [acq+32*0], m0 4820 pmaddwd m3, m1, m5 4821 mova [acq+32*1], m1 4822 add acq, 32*2 4823 paddd m2, m3 4824 paddd m4, m2 4825 dec hd 4826 jg .w32_loop 4827.w32_hpad: 4828 test hpadd, hpadd 4829 jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc 4830 paddd m2, m2 4831.w32_hpad_loop: 4832 mova [acq+32*0], m0 4833 mova [acq+32*1], m1 4834 paddd m4, m2 4835 mova [acq+32*2], m0 4836 mova [acq+32*3], m1 4837 add acq, 32*4 4838 sub hpadd, 2 4839 jg .w32_hpad_loop 4840 jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc 4841.w32_wpad: 4842 mova m0, [ypxq+ 0] 4843 cmp wpadd, 4 4844 jl .w32_wpad2 4845 je .w32_wpad4 4846 vpbroadcastw m1, [ypxq+14] 4847 vpblendd m0, m1, 0xf0 4848 jmp .w32_wpad_end 4849.w32_wpad4: 4850 vpbroadcastw m1, [ypxq+30] 4851 jmp .w32_wpad_end 4852.w32_wpad2: 4853 vpbroadcastw m1, [ypxq+46] 4854 vinserti128 m1, [ypxq+32], 0 4855.w32_wpad_end: 4856 add ypxq, strideq 4857 psllw m0, 3 4858 psllw m1, 3 4859 pmaddwd m2, m0, m5 4860 mova [acq+32*0], m0 4861 pmaddwd m3, m1, m5 4862 mova [acq+32*1], m1 4863 add acq, 32*2 4864 paddd m2, m3 4865 paddd m4, m2 4866 dec hd 4867 jg .w32_wpad 4868 jmp .w32_hpad 4869 4870cglobal pal_pred_16bpc, 4, 6, 6, dst, stride, pal, idx, w, h 4871 vbroadcasti128 m4, [palq] 4872 lea r2, [pal_pred_16bpc_avx2_table] 4873 tzcnt wd, 
wm 4874 vbroadcasti128 m5, [pal_pred_shuf] 4875 movifnidn hd, hm 4876 movsxd wq, [r2+wq*4] 4877 pshufb m4, m5 4878 punpckhqdq m5, m4, m4 4879 add wq, r2 4880DEFINE_ARGS dst, stride, stride3, idx, w, h 4881 lea stride3q, [strideq*3] 4882 jmp wq 4883.w4: 4884 movq xm0, [idxq] 4885 add idxq, 8 4886 psrlw xm1, xm0, 4 4887 punpcklbw xm0, xm1 4888 pshufb xm1, xm4, xm0 4889 pshufb xm2, xm5, xm0 4890 punpcklbw xm0, xm1, xm2 4891 punpckhbw xm1, xm2 4892 movq [dstq+strideq*0], xm0 4893 movq [dstq+strideq*2], xm1 4894 movhps [dstq+strideq*1], xm0 4895 movhps [dstq+stride3q ], xm1 4896 lea dstq, [dstq+strideq*4] 4897 sub hd, 4 4898 jg .w4 4899 RET 4900.w8: 4901 pmovzxbw m2, [idxq] 4902 add idxq, 16 4903 psllw m1, m2, 4 4904 por m2, m1 4905 pshufb m1, m4, m2 4906 pshufb m2, m5, m2 4907 punpcklbw m0, m1, m2 4908 punpckhbw m1, m2 4909 mova [dstq+strideq*0], xm0 4910 mova [dstq+strideq*1], xm1 4911 vextracti128 [dstq+strideq*2], m0, 1 4912 vextracti128 [dstq+stride3q ], m1, 1 4913 lea dstq, [dstq+strideq*4] 4914 sub hd, 4 4915 jg .w8 4916 RET 4917.w16: 4918 pshufd m3, [idxq], q3120 4919 add idxq, 32 4920 vpermq m3, m3, q3120 4921 psrlw m1, m3, 4 4922 punpcklbw m2, m3, m1 4923 punpckhbw m3, m1 4924 pshufb m1, m4, m2 4925 pshufb m2, m5, m2 4926 punpcklbw m0, m1, m2 4927 punpckhbw m1, m2 4928 mova [dstq+strideq*0], m0 4929 mova [dstq+strideq*1], m1 4930 pshufb m1, m4, m3 4931 pshufb m3, m5, m3 4932 punpcklbw m0, m1, m3 4933 punpckhbw m1, m3 4934 mova [dstq+strideq*2], m0 4935 mova [dstq+stride3q ], m1 4936 lea dstq, [dstq+strideq*4] 4937 sub hd, 4 4938 jg .w16 4939 RET 4940.w32: 4941 pshufd m3, [idxq], q3120 4942 add idxq, 32 4943 vpermq m3, m3, q3120 4944 psrlw m1, m3, 4 4945 punpcklbw m2, m3, m1 4946 punpckhbw m3, m1 4947 pshufb m1, m4, m2 4948 pshufb m2, m5, m2 4949 punpcklbw m0, m1, m2 4950 punpckhbw m1, m2 4951 mova [dstq+ 0], m0 4952 mova [dstq+32], m1 4953 pshufb m1, m4, m3 4954 pshufb m3, m5, m3 4955 punpcklbw m0, m1, m3 4956 punpckhbw m1, m3 4957 mova [dstq+strideq+ 0], m0 
4958 mova [dstq+strideq+32], m1 4959 lea dstq, [dstq+strideq*2] 4960 sub hd, 2 4961 jg .w32 4962 RET 4963.w64: 4964 pshufd m3, [idxq], q3120 4965 add idxq, 32 4966 vpermq m3, m3, q3120 4967 psrlw m1, m3, 4 4968 punpcklbw m2, m3, m1 4969 punpckhbw m3, m1 4970 pshufb m1, m4, m2 4971 pshufb m2, m5, m2 4972 punpcklbw m0, m1, m2 4973 punpckhbw m1, m2 4974 mova [dstq+32*0], m0 4975 mova [dstq+32*1], m1 4976 pshufb m1, m4, m3 4977 pshufb m3, m5, m3 4978 punpcklbw m0, m1, m3 4979 punpckhbw m1, m3 4980 mova [dstq+32*2], m0 4981 mova [dstq+32*3], m1 4982 add dstq, strideq 4983 dec hd 4984 jg .w64 4985 RET 4986 4987%endif 4988