1; Copyright © 2020, VideoLAN and dav1d authors 2; Copyright © 2020, Two Orioles, LLC 3; All rights reserved. 4; 5; Redistribution and use in source and binary forms, with or without 6; modification, are permitted provided that the following conditions are met: 7; 8; 1. Redistributions of source code must retain the above copyright notice, this 9; list of conditions and the following disclaimer. 10; 11; 2. Redistributions in binary form must reproduce the above copyright notice, 12; this list of conditions and the following disclaimer in the documentation 13; and/or other materials provided with the distribution. 14; 15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA 64

; Shuffle/permute index tables and rounding constants for the 16bpc
; AVX-512ICL motion-compensation functions. Byte values >= 32 in the
; spel_* / warp8x8_* tables select bytes from a second source register
; (two-register permute lookups) -- NOTE(review): the consumers of most
; of these tables are outside this chunk; verify against the subpel/warp
; code before changing any of them.
spel_h_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
 db 32, 33, 34, 35, 34, 35, 36, 37, 36, 37, 38, 39, 38, 39, 40, 41
spel_h_shufC: db 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15, 14, 15, 16, 17
 db 40, 41, 42, 43, 42, 43, 44, 45, 44, 45, 46, 47, 46, 47, 48, 49
 db 16, 17, 18, 19, 18, 19, 20, 21, 20, 21, 22, 23, 22, 23, 24, 25
 db 48, 49, 50, 51, 50, 51, 52, 53, 52, 53, 54, 55, 54, 55, 56, 57
spel_h_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
 db 36, 37, 38, 39, 38, 39, 40, 41, 40, 41, 42, 43, 42, 43, 44, 45
spel_h_shufD: db 12, 13, 14, 15, 14, 15, 16, 17, 16, 17, 18, 19, 18, 19, 20, 21
 db 44, 45, 46, 47, 46, 47, 48, 49, 48, 49, 50, 51, 50, 51, 52, 53
 db 20, 21, 22, 23, 22, 23, 24, 25, 24, 25, 26, 27, 26, 27, 28, 29
 db 52, 53, 54, 55, 54, 55, 56, 57, 56, 57, 58, 59, 58, 59, 60, 61
spel_v_shuf8: db 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23
 db 16, 17, 32, 33, 18, 19, 34, 35, 20, 21, 36, 37, 22, 23, 38, 39
 db 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31
 db 24, 25, 40, 41, 26, 27, 42, 43, 28, 29, 44, 45, 30, 31, 46, 47
spel_v_shuf16: db 0, 1, 32, 33, 2, 3, 34, 35, 4, 5, 36, 37, 6, 7, 38, 39
 db 8, 9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47
 db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55
 db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63
prep_endA: db 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
 db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62
 db 65, 66, 69, 70, 73, 74, 77, 78, 81, 82, 85, 86, 89, 90, 93, 94
 db 97, 98,101,102,105,106,109,110,113,114,117,118,121,122,125,126
prep_endB: db 1, 2, 5, 6, 9, 10, 13, 14, 33, 34, 37, 38, 41, 42, 45, 46
 db 17, 18, 21, 22, 25, 26, 29, 30, 49, 50, 53, 54, 57, 58, 61, 62
 db 65, 66, 69, 70, 73, 74, 77, 78, 97, 98,101,102,105,106,109,110
 db 81, 82, 85, 86, 89, 90, 93, 94,113,114,117,118,121,122,125,126
prep_endC: db 1, 2, 5, 6, 9, 10, 13, 14, 65, 66, 69, 70, 73, 74, 77, 78
 db 17, 18, 21, 22, 25, 26, 29, 30, 81, 82, 85, 86, 89, 90, 93, 94
 db 33, 34, 37, 38, 41, 42, 45, 46, 97, 98,101,102,105,106,109,110
 db 49, 50, 53, 54, 57, 58, 61, 62,113,114,117,118,121,122,125,126
spel_shuf4a: db 1, 2, 17, 18, 5, 6, 21, 22, 9, 10, 25, 26, 13, 14, 29, 30
 db 17, 18, 33, 34, 21, 22, 37, 38, 25, 26, 41, 42, 29, 30, 45, 46
 db 33, 34, 49, 50, 37, 38, 53, 54, 41, 42, 57, 58, 45, 46, 61, 62
 db 49, 50, 65, 66, 53, 54, 69, 70, 57, 58, 73, 74, 61, 62, 77, 78
spel_shuf4b: db 50, 51, 65, 66, 54, 55, 69, 70, 58, 59, 73, 74, 62, 63, 77, 78
 db 65, 66, 81, 82, 69, 70, 85, 86, 73, 74, 89, 90, 77, 78, 93, 94
 db 81, 82, 97, 98, 85, 86,101,102, 89, 90,105,106, 93, 94,109,110
 db 97, 98,113,114,101,102,117,118,105,106,121,122,109,110,125,126
spel_shuf8a: db 1, 2, 17, 18, 5, 6, 21, 22, 9, 10, 25, 26, 13, 14, 29, 30
 db 17, 18, 65, 66, 21, 22, 69, 70, 25, 26, 73, 74, 29, 30, 77, 78
 db 33, 34, 49, 50, 37, 38, 53, 54, 41, 42, 57, 58, 45, 46, 61, 62
 db 49, 50, 97, 98, 53, 54,101,102, 57, 58,105,106, 61, 62,109,110
spel_shuf8b: db 18, 19, 65, 66, 22, 23, 69, 70, 26, 27, 73, 74, 30, 31, 77, 78
 db 65, 66, 81, 82, 69, 70, 85, 86, 73, 74, 89, 90, 77, 78, 93, 94
 db 50, 51, 97, 98, 54, 55,101,102, 58, 59,105,106, 62, 63,109,110
 db 97, 98,113,114,101,102,117,118,105,106,121,122,109,110,125,126
spel_shuf16: db 1, 2, 33, 34, 5, 6, 37, 38, 9, 10, 41, 42, 13, 14, 45, 46
 db 17, 18, 49, 50, 21, 22, 53, 54, 25, 26, 57, 58, 29, 30, 61, 62
 db 65, 66, 97, 98, 69, 70,101,102, 73, 74,105,106, 77, 78,109,110
 db 81, 82,113,114, 85, 86,117,118, 89, 90,121,122, 93, 94,125,126
spel_shuf32: db 1, 2, 65, 66, 5, 6, 69, 70, 9, 10, 73, 74, 13, 14, 77, 78
 db 17, 18, 81, 82, 21, 22, 85, 86, 25, 26, 89, 90, 29, 30, 93, 94
 db 33, 34, 97, 98, 37, 38,101,102, 41, 42,105,106, 45, 46,109,110
 db 49, 50,113,114, 53, 54,117,118, 57, 58,121,122, 61, 62,125,126
spel_h_shuf2b: db 1, 2, 17, 18, 5, 6, 21, 22, 17, 18, 33, 34, 21, 22, 37, 38
 db 33, 34, 49, 50, 37, 38, 53, 54, 49, 50, 9, 10, 53, 54, 13, 14
 db 9, 10, 25, 26, 13, 14, 29, 30, 25, 26, 41, 42, 29, 30, 45, 46
spel_shuf2: db 10, 11, 17, 18, 14, 15, 21, 22, 17, 18, 25, 26, 21, 22, 29, 30
spel_h_shuf2a: db 0, 1, 2, 3, 2, 3, 4, 5, 16, 17, 18, 19, 18, 19, 20, 21
 db 4, 5, 6, 7, 6, 7, 8, 9, 20, 21, 22, 23, 22, 23, 24, 25
w_mask_end42x: db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
 db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125
w_mask_end444: db 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
 db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62
 db 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94
 db 96, 98,100,102,104,106,108,110,112,114,116,118,120,122,124,126
w_mask_shuf4: db 0, 2, 8, 10, 4, 6, 12, 14, 16, 18, 24, 26, 20, 22, 28, 30
 db 32, 34, 40, 42, 36, 38, 44, 46, 48, 50, 56, 58, 52, 54, 60, 62
 db 64, 66, 72, 74, 68, 70, 76, 78, 80, 82, 88, 90, 84, 86, 92, 94
 db 96, 98,104,106,100,102,108,110,112,114,120,122,116,118,124,126
w_mask_shuf8: db 0, 2, 16, 18, 4, 6, 20, 22, 8, 10, 24, 26, 12, 14, 28, 30
 db 32, 34, 48, 50, 36, 38, 52, 54, 40, 42, 56, 58, 44, 46, 60, 62
 db 64, 66, 80, 82, 68, 70, 84, 86, 72, 74, 88, 90, 76, 78, 92, 94
 db 96, 98,112,114,100,102,116,118,104,106,120,122,108,110,124,126
w_mask_shuf16: db 0, 2, 32, 34, 4, 6, 36, 38, 8, 10, 40, 42, 12, 14, 44, 46
 db 16, 18, 48, 50, 20, 22, 52, 54, 24, 26, 56, 58, 28, 30, 60, 62
 db 64, 66, 96, 98, 68, 70,100,102, 72, 74,104,106, 76, 78,108,110
 db 80, 82,112,114, 84, 86,116,118, 88, 90,120,122, 92, 94,124,126
warp8x8_permA: db 0, 1, 2, 3, 32, 33, 34, 35, 2, 3, 4, 5, 34, 35, 36, 37
 db 4, 5, 6, 7, 36, 37, 38, 39, 6, 7, 8, 9, 38, 39, 40, 41
 db 8, 9, 10, 11, 40, 41, 42, 43, 10, 11, 12, 13, 42, 43, 44, 45
 db 12, 13, 14, 15, 44, 45, 46, 47, 14, 15, 16, 17, 46, 47, 48, 49
warp8x8_permB: db 12, 13, 14, 15, 44, 45, 46, 47, 14, 15, 16, 17, 46, 47, 48, 49
 db 16, 17, 18, 19, 48, 49, 50, 51, 18, 19, 20, 21, 50, 51, 52, 53
 db 20, 21, 22, 23, 52, 53, 54, 55, 22, 23, 24, 25, 54, 55, 56, 57
 db 24, 25, 26, 27, 56, 57, 58, 59, 26, 27, 28, 29, 58, 59, 60, 61
warp8x8_end: db 0, 1, 4, 5, 16, 17, 20, 21, 32, 33, 36, 37, 48, 49, 52, 53
 db 2, 3, 6, 7, 18, 19, 22, 23, 34, 35, 38, 39, 50, 51, 54, 55
 db 8, 9, 12, 13, 24, 25, 28, 29, 40, 41, 44, 45, 56, 57, 60, 61
 db 10, 11, 14, 15, 26, 27, 30, 31, 42, 43, 46, 47, 58, 59, 62, 63
; deint_q_shuf carries no data of its own: it overlays the constants that
; follow it, and the commented-out dq line documents the effective qword
; values obtained through that aliasing -- NOTE(review): do not reorder
; the constants below without checking every label that reads past its
; own nominal size (same for the bare "dd 1"/"dd 3" filler words).
deint_q_shuf: ;dq 0, 2, 4, 6, 1, 3, 5, 7
pd_0to7: dd 0, 1, 2, 3, 4, 5, 6, 7
 dd 1
pw_2048: times 2 dw 2048
 dd 3
pw_8192: times 2 dw 8192
avg_shift: dw 5, 5, 3, 3
pw_27615: times 2 dw 27615
pw_32766: times 2 dw 32766
warp8x8_permC: db -1, 0, -1, 1, -1, 8, -1, 9, -1, 4, -1, 5, -1, 12, -1, 13
warp8x8_permD: db -1, 2, -1, 3, -1, 10, -1, 11, -1, 6, -1, 7, -1, 14, -1, 15
warp_shift_h: db 11, 19, 11, 19, 43, 51, 43, 51, 13, 21, 13, 21, 45, 53, 45, 53
blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
resize_permA: dd 0, 4, 8, 12, 1, 5, 9, 13, 16, 20, 24, 28, 17, 21, 25, 29
resize_permB: dd 2, 6, 10, 14, 3, 7, 11, 15, 18, 22, 26, 30, 19, 23, 27, 31
resize_permC: dq 0, 1, 4, 5, 8, 9, 12, 13
resize_permD: dq 2, 3, 6, 7, 10, 11, 14, 15
resize_permE: dq 0, 2, 4, 6
resize_shufA: db -1, 0, -1, 1, -1, 4, -1, 5, -1, 8, -1, 9, -1, 12, -1, 13
resize_shufB: db -1, 2, -1, 3, -1, 6, -1, 7, -1, 10, -1, 11, -1, 14, -1, 15
rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
resize_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7
 db 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15

; Per-bitdepth rounding/scaling constants; pairs like "dw 8, 8, 10, 10"
; are indexed by (bitdepth_max >> 11)*4 to pick the 10-bit or 12-bit
; variant (see put_bilin/prep_bilin below).
prep_hv_shift: dq 6, 4
put_bilin_h_rnd: dw 8, 8, 10, 10
prep_mul: dw 16, 16, 4, 4
put_8tap_h_rnd: dd 34, 40
prep_8tap_rnd: dd 128 - (8192 << 8)
warp_8x8_rnd_h: dd 512, 2048
warp_8x8_rnd_v: dd 262144, 65536
warp_8x8t_rnd_v: dd 16384 - (8192 << 15)
avg_round: dw -16400, -16400, -16388, -16388
w_avg_round: dd 128 + (8192 << 4), 32 + (8192 << 4)
mask_round: dd 512 + (8192 << 6), 128 + (8192 << 6)
w_mask_round: dd 128, 64
bidir_shift: dw 6, 6, 4, 4

pb_64: times 4 db 64
pw_m512: times 2 dw -512
pw_2: times 2 dw 2
pw_64: times 2 dw 64
pd_32: dd 32
pd_63: dd 63
pd_128: dd 128
pd_640: dd 640
pd_2176: dd 2176
pd_16384: dd 16384
pd_0_4: dd 0, 4

; Aliases: pw_16 reuses the first words of prep_mul (16, 16), pd_512 the
; first dword of warp_8x8_rnd_h (512), saving rodata space.
%define pw_16 prep_mul
%define pd_512 warp_8x8_rnd_h

; BASE_JMP_TABLE fn, suffix, w1, w2, ...
; Emits a table of 16-bit offsets (relative to the fn_suffix base label)
; to the .w<N> entry points, and defines fn_suffix_table so that indexing
; with tzcnt(w) lands on the right entry (table base is biased by the
; first width argument).
%macro BASE_JMP_TABLE 3-*
 %xdefine %1_%2_table (%%table - %3)
 %xdefine %%base %1_%2
 %%table:
 %rep %0 - 2
  dw %%base %+ _w%3 - %%base
  %rotate 1
 %endrep
%endmacro

; HV_JMP_TABLE fn, type, suffix, types, w1, w2, ...
; Emits up to three 16-bit offset tables (.h_w<N>, .v_w<N>, .hv_w<N>)
; for the mangled fn_type_16bpc_suffix symbol; 'types' is a bitmask
; selecting which tables to emit (1 = h, 2 = v, 4 = hv).
%macro HV_JMP_TABLE 5-*
 %xdefine %%prefix mangle(private_prefix %+ _%1_%2_16bpc_%3)
 %xdefine %%base %1_%3
 %assign %%types %4
 %if %%types & 1
  %xdefine %1_%2_h_%3_table (%%h - %5)
  %%h:
  %rep %0 - 4
   dw %%prefix %+ .h_w%5 - %%base
   %rotate 1
  %endrep
  %rotate 4
 %endif
 %if %%types & 2
  %xdefine %1_%2_v_%3_table (%%v - %5)
  %%v:
  %rep %0 - 4
   dw %%prefix %+ .v_w%5 - %%base
   %rotate 1
  %endrep
  %rotate 4
 %endif
 %if %%types & 4
  %xdefine %1_%2_hv_%3_table (%%hv - %5)
  %%hv:
  %rep %0 - 4
   dw %%prefix %+ .hv_w%5 - %%base
   %rotate 1
  %endrep
 %endif
%endmacro

; BIDIR_JMP_TABLE fn, suffix, w1, w2, ...
; Like BASE_JMP_TABLE but with 32-bit entries, offsets relative to the
; table itself (biased by 2*first width).
%macro BIDIR_JMP_TABLE 2-*
 %xdefine %1_%2_table (%%table - 2*%3)
 %xdefine %%base %1_%2_table
 %xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2)
 %%table:
 %rep %0 - 2
  dd %%prefix %+ .w%3 - %%base
  %rotate 1
 %endrep
%endmacro

%xdefine put_avx512icl mangle(private_prefix %+ _put_bilin_16bpc_avx512icl.put)
%xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_16bpc_avx512icl.prep)

BIDIR_JMP_TABLE avg, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_420, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_422, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_444, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE blend, avx512icl, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_v, avx512icl, 2, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_h, avx512icl, 2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE put, avx512icl, 2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE prep, avx512icl, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE put, bilin, avx512icl, 7, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, bilin, avx512icl, 7, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE put, 6tap, avx512icl, 2, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE put, 8tap, avx512icl, 2, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, 6tap, avx512icl, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, 8tap, avx512icl, 2, 4, 8, 16, 32, 64, 128

; table_offset(put, _bilin_h) -> offset of put_bilin_h_<SUFFIX>_table
; from the put_<SUFFIX> base label, for use in indexed loads.
%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX

cextern mc_subpel_filters
%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)

cextern mc_warp_filter
cextern obmc_masks_avx2
cextern resize_filter

SECTION .text

; t0 is a scratch GPR for jump-table dispatch; on Win64 r4 is free for
; this (w is reloaded from memory), elsewhere r8 is used.
%if WIN64
DECLARE_REG_TMP 4
%else
DECLARE_REG_TMP 8
%endif

INIT_ZMM avx512icl
; put_bilin_16bpc(dst, dst_stride, src, src_stride, w, h, mx, my,
;                 bitdepth_max)
; mx is read from r6m, my from r7m, bitdepth_max from r8m.
; Dispatch: tzcnt(w) indexes a per-width jump table for one of four
; paths -- .put (mx == my == 0: plain copy), .h (horizontal filter
; only), .v (vertical only), .hv (both).
cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w, h, mxy
 mov mxyd, r6m ; mx
 lea r7, [put_avx512icl]
 tzcnt t0d, wm
 movifnidn hd, hm
 test mxyd, mxyd
 jnz .h
 mov mxyd, r7m ; my
 test mxyd, mxyd
 jnz .v
.put:
 movzx t0d, word [r7+t0*2+table_offset(put,)]
 add t0, r7
 jmp t0
; Plain copy paths: two rows per iteration (one row for w128), widest
; available register class for each width.
.put_w2:
 mov r6d, [srcq+ssq*0]
 mov r7d, [srcq+ssq*1]
 lea srcq, [srcq+ssq*2]
 mov [dstq+dsq*0], r6d
 mov [dstq+dsq*1], r7d
 lea dstq, [dstq+dsq*2]
 sub hd, 2
 jg .put_w2
 RET
.put_w4:
 mov r6, [srcq+ssq*0]
 mov r7, [srcq+ssq*1]
 lea srcq, [srcq+ssq*2]
 mov [dstq+dsq*0], r6
 mov [dstq+dsq*1], r7
 lea dstq, [dstq+dsq*2]
 sub hd, 2
 jg .put_w4
 RET
.put_w8:
 movu xmm0, [srcq+ssq*0]
 movu xmm1, [srcq+ssq*1]
 lea srcq, [srcq+ssq*2]
 mova [dstq+dsq*0], xmm0
 mova [dstq+dsq*1], xmm1
 lea dstq, [dstq+dsq*2]
 sub hd, 2
 jg .put_w8
 RET
.put_w16:
 movu ym0, [srcq+ssq*0]
 movu ym1, [srcq+ssq*1]
 lea srcq, [srcq+ssq*2]
 mova [dstq+dsq*0], ym0
 mova [dstq+dsq*1], ym1
 lea dstq, [dstq+dsq*2]
 sub hd, 2
 jg .put_w16
 RET
.put_w32:
 movu m0, [srcq+ssq*0]
 movu m1, [srcq+ssq*1]
 lea srcq, [srcq+ssq*2]
 mova [dstq+dsq*0], m0
 mova [dstq+dsq*1], m1
 lea dstq, [dstq+dsq*2]
 sub hd, 2
 jg .put_w32
 RET
.put_w64:
 movu m0, [srcq+ssq*0+64*0]
 movu m1, [srcq+ssq*0+64*1]
 movu m2, [srcq+ssq*1+64*0]
 movu m3, [srcq+ssq*1+64*1]
 lea srcq, [srcq+ssq*2]
 mova [dstq+dsq*0+64*0], m0
 mova [dstq+dsq*0+64*1], m1
 mova [dstq+dsq*1+64*0], m2
 mova [dstq+dsq*1+64*1], m3
 lea dstq, [dstq+dsq*2]
 sub hd, 2
 jg .put_w64
 RET
.put_w128:
 movu m0, [srcq+64*0]
 movu m1, [srcq+64*1]
 movu m2, [srcq+64*2]
 movu m3, [srcq+64*3]
 add srcq, ssq
 mova [dstq+64*0], m0
 mova [dstq+64*1], m1
 mova [dstq+64*2], m2
 mova [dstq+64*3], m3
 add dstq, dsq
 dec hd
 jg .put_w128
 RET
; Horizontal-only: m4 = 16-mx, m5 = mx weights; m6 = per-bitdepth
; rounding constant (put_bilin_h_rnd indexed by bitdepth_max >> 11).
.h:
 vpbroadcastw m5, mxyd
 mov mxyd, r7m ; my
 vpbroadcastd m4, [pw_16]
 psubw m4, m5
 test mxyd, mxyd
 jnz .hv
 ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v
 movzx t0d, word [r7+t0*2+table_offset(put, _bilin_h)]
 mov r6d, r8m ; bitdepth_max
 add t0, r7
 shr r6d, 11
 vpbroadcastd m6, [r7-put_avx512icl+put_bilin_h_rnd+r6*4]
 jmp t0
.h_w2:
 movq xmm1, [srcq+ssq*0]
 movhps xmm1, [srcq+ssq*1]
 lea srcq, [srcq+ssq*2]
 pmullw xmm0, xmm1, xm4
 psrlq xmm1, 16 ; shift by one pixel to get src[x+1]
 pmullw xmm1, xm5
 paddw xmm0, xm6
 paddw xmm0, xmm1
 psrlw xmm0, 4
 movd [dstq+dsq*0], xmm0
 pextrd [dstq+dsq*1], xmm0, 2
 lea dstq, [dstq+dsq*2]
 sub hd, 2
 jg .h_w2
 RET
.h_w4:
 movq xmm0, [srcq+ssq*0+0]
 movhps xmm0, [srcq+ssq*1+0]
 movq xmm1, [srcq+ssq*0+2]
 movhps xmm1, [srcq+ssq*1+2]
 lea srcq, [srcq+ssq*2]
 pmullw xmm0, xm4
 pmullw xmm1, xm5
 paddw xmm0, xm6
 paddw xmm0, xmm1
 psrlw xmm0, 4
 movq [dstq+dsq*0], xmm0
 movhps [dstq+dsq*1], xmm0
 lea dstq, [dstq+dsq*2]
 sub hd, 2
 jg .h_w4
 RET
.h_w8:
 movu xm0, [srcq+ssq*0+0]
 vinserti32x4 ym0, [srcq+ssq*1+0], 1
 movu xm1, [srcq+ssq*0+2]
 vinserti32x4 ym1, [srcq+ssq*1+2], 1
 lea srcq, [srcq+ssq*2]
 pmullw ym0, ym4
 pmullw ym1, ym5
 paddw ym0, ym6
 paddw ym0, ym1
 psrlw ym0, 4
 mova [dstq+dsq*0], xm0
 vextracti32x4 [dstq+dsq*1], ym0, 1
 lea dstq, [dstq+dsq*2]
 sub hd, 2
 jg .h_w8
 RET
.h_w16:
 movu ym0, [srcq+ssq*0+0]
 vinserti32x8 m0, [srcq+ssq*1+0], 1
 movu ym1, [srcq+ssq*0+2]
 vinserti32x8 m1, [srcq+ssq*1+2], 1
 lea srcq, [srcq+ssq*2]
 pmullw m0, m4
 pmullw m1, m5
 paddw m0, m6
 paddw m0, m1
 psrlw m0, 4
 mova [dstq+dsq*0], ym0
 vextracti32x8 [dstq+dsq*1], m0, 1
 lea dstq, [dstq+dsq*2]
 sub hd, 2
 jg .h_w16
 RET
.h_w32:
 pmullw m0, m4, [srcq+ssq*0+0]
 pmullw m2, m5, [srcq+ssq*0+2]
 pmullw m1, m4, [srcq+ssq*1+0]
 pmullw m3, m5, [srcq+ssq*1+2]
 lea srcq, [srcq+ssq*2]
 paddw m0, m6
 paddw m1, m6
 paddw m0, m2
 paddw m1, m3
 psrlw m0, 4
 psrlw m1, 4
 mova [dstq+dsq*0], m0
 mova [dstq+dsq*1], m1
 lea dstq, [dstq+dsq*2]
 sub hd, 2
 jg .h_w32
 RET
.h_w64:
 pmullw m0, m4, [srcq+64*0+0]
 pmullw m2, m5, [srcq+64*0+2]
 pmullw m1, m4, [srcq+64*1+0]
 pmullw m3, m5, [srcq+64*1+2]
 add srcq, ssq
 paddw m0, m6
 paddw m1, m6
 paddw m0, m2
 paddw m1, m3
 psrlw m0, 4
 psrlw m1, 4
 mova [dstq+64*0], m0
 mova [dstq+64*1], m1
 add dstq, dsq
 dec hd
 jg .h_w64
 RET
.h_w128:
 pmullw m0, m4, [srcq+64*0+0]
 pmullw m7, m5, [srcq+64*0+2]
 pmullw m1, m4, [srcq+64*1+0]
 pmullw m8, m5, [srcq+64*1+2]
 pmullw m2, m4, [srcq+64*2+0]
 pmullw m9, m5, [srcq+64*2+2]
 pmullw m3, m4, [srcq+64*3+0]
 pmullw m10, m5, [srcq+64*3+2]
 add srcq, ssq
 REPX {paddw x, m6}, m0, m1, m2, m3
 paddw m0, m7
 paddw m1, m8
 paddw m2, m9
 paddw m3, m10
 REPX {psrlw x, 4}, m0, m1, m2, m3
 mova [dstq+64*0], m0
 mova [dstq+64*1], m1
 mova [dstq+64*2], m2
 mova [dstq+64*3], m3
 add dstq, dsq
 dec hd
 jg .h_w128
 RET
; Vertical-only: dst = a + ((b - a) * my) via pmulhrsw with my scaled
; into the high bits (shl 11), keeping one previous row live between
; iterations.
.v:
 movzx t0d, word [r7+t0*2+table_offset(put, _bilin_v)]
 shl mxyd, 11
 vpbroadcastw m8, mxyd
 add t0, r7
 jmp t0
.v_w2:
 movd xmm0, [srcq+ssq*0]
.v_w2_loop:
 movd xmm1, [srcq+ssq*1]
 lea srcq, [srcq+ssq*2]
 punpckldq xmm2, xmm0, xmm1
 movd xmm0, [srcq+ssq*0]
 punpckldq xmm1, xmm0
 psubw xmm1, xmm2
 pmulhrsw xmm1, xm8
 paddw xmm1, xmm2
 movd [dstq+dsq*0], xmm1
 pextrd [dstq+dsq*1], xmm1, 1
 lea dstq, [dstq+dsq*2]
 sub hd, 2
 jg .v_w2_loop
 RET
.v_w4:
 movq xmm0, [srcq+ssq*0]
.v_w4_loop:
 movq xmm1, [srcq+ssq*1]
 lea srcq, [srcq+ssq*2]
 punpcklqdq xmm2, xmm0, xmm1
 movq xmm0, [srcq+ssq*0]
 punpcklqdq xmm1, xmm0
 psubw xmm1, xmm2
 pmulhrsw xmm1, xm8
 paddw xmm1, xmm2
 movq [dstq+dsq*0], xmm1
 movhps [dstq+dsq*1], xmm1
 lea dstq, [dstq+dsq*2]
 sub hd, 2
 jg .v_w4_loop
 RET
.v_w8:
 movu xmm0, [srcq+ssq*0]
.v_w8_loop:
 vbroadcasti128 ymm1, [srcq+ssq*1]
 lea srcq, [srcq+ssq*2]
 vpblendd ymm2, ymm0, ymm1, 0xf0
 vbroadcasti128 ymm0, [srcq+ssq*0]
 vpblendd ymm1, ymm0, 0xf0
 psubw ymm1, ymm2
 pmulhrsw ymm1, ym8
 paddw ymm1, ymm2
 mova [dstq+dsq*0], xmm1
 vextracti128 [dstq+dsq*1], ymm1, 1
 lea dstq, [dstq+dsq*2]
 sub hd, 2
 jg .v_w8_loop
 vzeroupper ; legacy ymm registers were used above
 RET
.v_w16:
 movu ym0, [srcq+ssq*0]
.v_w16_loop:
 movu ym3, [srcq+ssq*1]
 lea srcq, [srcq+ssq*2]
 psubw ym1, ym3, ym0
 pmulhrsw ym1, ym8
 paddw ym1, ym0
 movu ym0, [srcq+ssq*0]
 psubw ym2, ym0, ym3
 pmulhrsw ym2, ym8
 paddw ym2, ym3
 mova [dstq+dsq*0], ym1
 mova [dstq+dsq*1], ym2
 lea dstq, [dstq+dsq*2]
 sub hd, 2
 jg .v_w16_loop
 RET
.v_w32:
 movu m0, [srcq+ssq*0]
.v_w32_loop:
 movu m3, [srcq+ssq*1]
 lea srcq, [srcq+ssq*2]
 psubw m1, m3, m0
 pmulhrsw m1, m8
 paddw m1, m0
 movu m0, [srcq+ssq*0]
 psubw m2, m0, m3
 pmulhrsw m2, m8
 paddw m2, m3
 mova [dstq+dsq*0], m1
 mova [dstq+dsq*1], m2
 lea dstq, [dstq+dsq*2]
 sub hd, 2
 jg .v_w32_loop
 RET
.v_w64:
 movu m0, [srcq+ssq*0+64*0]
 movu m1, [srcq+ssq*0+64*1]
.v_w64_loop:
 movu m2, [srcq+ssq*1+64*0]
 movu m3, [srcq+ssq*1+64*1]
 lea srcq, [srcq+ssq*2]
 psubw m4, m2, m0
 pmulhrsw m4, m8
 paddw m4, m0
 movu m0, [srcq+ssq*0+64*0]
 psubw m5, m3, m1
 pmulhrsw m5, m8
 paddw m5, m1
 movu m1, [srcq+ssq*0+64*1]
 psubw m6, m0, m2
 pmulhrsw m6, m8
 psubw m7, m1, m3
 pmulhrsw m7, m8
 mova [dstq+dsq*0+64*0], m4
 mova [dstq+dsq*0+64*1], m5
 paddw m6, m2
 paddw m7, m3
 mova [dstq+dsq*1+64*0], m6
 mova [dstq+dsq*1+64*1], m7
 lea dstq, [dstq+dsq*2]
 sub hd, 2
 jg .v_w64_loop
 RET
.v_w128:
 movu m0, [srcq+ssq*0+64*0]
 movu m1, [srcq+ssq*0+64*1]
 movu m2, [srcq+ssq*0+64*2]
 movu m3, [srcq+ssq*0+64*3]
.v_w128_loop:
 movu m4, [srcq+ssq*1+64*0]
 movu m5, [srcq+ssq*1+64*1]
 movu m6, [srcq+ssq*1+64*2]
 movu m7, [srcq+ssq*1+64*3]
 lea srcq, [srcq+ssq*2]
 psubw m9, m4, m0
 pmulhrsw m9, m8
 paddw m9, m0
 movu m0, [srcq+ssq*0+64*0]
 psubw m10, m5, m1
 pmulhrsw m10, m8
 paddw m10, m1
 movu m1, [srcq+ssq*0+64*1]
 psubw m11, m6, m2
 pmulhrsw m11, m8
 paddw m11, m2
 movu m2, [srcq+ssq*0+64*2]
 psubw m12, m7, m3
 pmulhrsw m12, m8
 paddw m12, m3
 movu m3, [srcq+ssq*0+64*3]
 mova [dstq+dsq*0+64*0], m9
 psubw m9, m0, m4
 pmulhrsw m9, m8
 mova [dstq+dsq*0+64*1], m10
 psubw m10, m1, m5
 pmulhrsw m10, m8
 mova [dstq+dsq*0+64*2], m11
 psubw m11, m2, m6
 pmulhrsw m11, m8
 mova [dstq+dsq*0+64*3], m12
 psubw m12, m3, m7
 pmulhrsw m12, m8
 paddw m9, m4
 paddw m10, m5
 mova [dstq+dsq*1+64*0], m9
 mova [dstq+dsq*1+64*1], m10
 paddw m11, m6
 paddw m12, m7
 mova [dstq+dsq*1+64*2], m11
 mova [dstq+dsq*1+64*3], m12
 lea dstq, [dstq+dsq*2]
 sub hd, 2
 jg .v_w128_loop
 RET
; Horizontal + vertical: horizontal pass keeps 2 extra bits of
; precision (>> 2 instead of >> 4); for 10-bit the h weights are
; scaled up (psllw 2) and the final pmulhrsw uses pw_2048, for 12-bit
; (bit 11 of bitdepth_max set) pw_8192 is kept instead.
.hv:
 movzx t0d, word [r7+t0*2+table_offset(put, _bilin_hv)]
 shl mxyd, 11
 vpbroadcastd m6, [pw_2]
 vpbroadcastw m7, mxyd
 vpbroadcastd m8, [pw_8192]
 add t0, r7
 test dword r8m, 0x800
 jnz .hv_12bpc
 psllw m4, 2
 psllw m5, 2
 vpbroadcastd m8, [pw_2048]
.hv_12bpc:
 jmp t0
.hv_w2:
 vpbroadcastq xmm1, [srcq+ssq*0]
 pmullw xmm0, xmm1, xm4
 psrlq xmm1, 16
 pmullw xmm1, xm5
 paddw xmm0, xm6
 paddw xmm0, xmm1
 psrlw xmm0, 2
.hv_w2_loop:
 movq xmm2, [srcq+ssq*1]
 lea srcq, [srcq+ssq*2]
 movhps xmm2, [srcq+ssq*0]
 pmullw xmm1, xmm2, xm4
 psrlq xmm2, 16
 pmullw xmm2, xm5
 paddw xmm1, xm6
 paddw xmm1, xmm2
 psrlw xmm1, 2 ; 1 _ 2 _
 shufpd xmm2, xmm0, xmm1, 0x01 ; 0 _ 1 _
 mova xmm0, xmm1
 psubw xmm1, xmm2
 paddw xmm1, xmm1
 pmulhw xmm1, xm7
 paddw xmm1, xmm2
 pmulhrsw xmm1, xm8
 movd [dstq+dsq*0], xmm1
 pextrd [dstq+dsq*1], xmm1, 2
 lea dstq, [dstq+dsq*2]
 sub hd, 2
 jg .hv_w2_loop
 RET
.hv_w4:
 ; row 0 is loaded 8 bytes early so its result lands in the high half
 pmullw xmm0, xm4, [srcq+ssq*0-8]
 pmullw xmm1, xm5, [srcq+ssq*0-6]
 paddw xmm0, xm6
 paddw xmm0, xmm1
 psrlw xmm0, 2
.hv_w4_loop:
 movq xmm1, [srcq+ssq*1+0]
 movq xmm2, [srcq+ssq*1+2]
 lea srcq, [srcq+ssq*2]
 movhps xmm1, [srcq+ssq*0+0]
 movhps xmm2, [srcq+ssq*0+2]
 pmullw xmm1, xm4
 pmullw xmm2, xm5
 paddw xmm1, xm6
 paddw xmm1, xmm2
 psrlw xmm1, 2 ; 1 2
 shufpd xmm2, xmm0, xmm1, 0x01 ; 0 1
 mova xmm0, xmm1
 psubw xmm1, xmm2
 paddw xmm1, xmm1
 pmulhw xmm1, xm7
 paddw xmm1, xmm2
 pmulhrsw xmm1, xm8
 movq [dstq+dsq*0], xmm1
 movhps [dstq+dsq*1], xmm1
 lea dstq, [dstq+dsq*2]
 sub hd, 2
 jg .hv_w4_loop
 RET
.hv_w8:
 pmullw xmm0, xm4, [srcq+ssq*0+0]
 pmullw xmm1, xm5, [srcq+ssq*0+2]
 paddw xmm0, xm6
 paddw xmm0, xmm1
 psrlw xmm0, 2
 vinserti32x4 ym0, xmm0, 1
.hv_w8_loop:
 movu xm1, [srcq+ssq*1+0]
 movu xm2, [srcq+ssq*1+2]
 lea srcq, [srcq+ssq*2]
 vinserti32x4 ym1, [srcq+ssq*0+0], 1
 vinserti32x4 ym2, [srcq+ssq*0+2], 1
 pmullw ym1, ym4
 pmullw ym2, ym5
 paddw ym1, ym6
 paddw ym1, ym2
 psrlw ym1, 2 ; 1 2
 vshufi32x4 ym2, ym0, ym1, 0x01 ; 0 1
 mova ym0, ym1
 psubw ym1, ym2
 paddw ym1, ym1
 pmulhw ym1, ym7
 paddw ym1, ym2
 pmulhrsw ym1, ym8
 mova [dstq+dsq*0], xm1
 vextracti32x4 [dstq+dsq*1], ym1, 1
 lea dstq, [dstq+dsq*2]
 sub hd, 2
 jg .hv_w8_loop
 RET
.hv_w16:
 pmullw ym0, ym4, [srcq+ssq*0+0]
 pmullw ym1, ym5, [srcq+ssq*0+2]
 paddw ym0, ym6
 paddw ym0, ym1
 psrlw ym0, 2
 vinserti32x8 m0, ym0, 1
.hv_w16_loop:
 movu ym1, [srcq+ssq*1+0]
 movu ym2, [srcq+ssq*1+2]
 lea srcq, [srcq+ssq*2]
 vinserti32x8 m1, [srcq+ssq*0+0], 1
 vinserti32x8 m2, [srcq+ssq*0+2], 1
 pmullw m1, m4
 pmullw m2, m5
 paddw m1, m6
 paddw m1, m2
 psrlw m1, 2 ; 1 2
 vshufi32x4 m2, m0, m1, q1032 ; 0 1
 mova m0, m1
 psubw m1, m2
 paddw m1, m1
 pmulhw m1, m7
 paddw m1, m2
 pmulhrsw m1, m8
 mova [dstq+dsq*0], ym1
 vextracti32x8 [dstq+dsq*1], m1, 1
 lea dstq, [dstq+dsq*2]
 sub hd, 2
 jg .hv_w16_loop
 RET
.hv_w32:
.hv_w64:
.hv_w128:
 ; Process in 32-pixel-wide columns; r6d packs the column counter in
 ; the high byte (wq*8-256) and the row count in the low byte.
 movifnidn wd, wm
 lea r6d, [hq+wq*8-256]
 mov r4, srcq
 mov r7, dstq
.hv_w32_loop0:
 pmullw m0, m4, [srcq+ssq*0+0]
 pmullw m1, m5, [srcq+ssq*0+2]
 paddw m0, m6
 paddw m0, m1
 psrlw m0, 2
.hv_w32_loop:
 pmullw m3, m4, [srcq+ssq*1+0]
 pmullw m1, m5, [srcq+ssq*1+2]
 lea srcq, [srcq+ssq*2]
 paddw m3, m6
 paddw m3, m1
 psrlw m3, 2
 psubw m1, m3, m0
 paddw m1, m1
 pmulhw m1, m7
 paddw m1, m0
 pmullw m0, m4, [srcq+ssq*0+0]
 pmullw m2, m5, [srcq+ssq*0+2]
 paddw m0, m6
 paddw m0, m2
 psrlw m0, 2
 psubw m2, m0, m3
 paddw m2, m2
 pmulhw m2, m7
 paddw m2, m3
 pmulhrsw m1, m8
 pmulhrsw m2, m8
 mova [dstq+dsq*0], m1
 mova [dstq+dsq*1], m2
 lea dstq, [dstq+dsq*2]
 sub hd, 2
 jg .hv_w32_loop
 add r4, 64
 add r7, 64
 movzx hd, r6b
 mov srcq, r4
 mov dstq, r7
 sub r6d, 1<<8
 jg .hv_w32_loop0
 RET

; prep_bilin_16bpc(tmp, src, stride, w, h, mx, my, bitdepth_max)
; Writes the intermediate (prep) buffer; same four-way dispatch as
; put_bilin above (.prep / .h / .v / .hv). The function continues past
; the end of this chunk.
cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride, w, h, mxy, stride3
 movifnidn mxyd, r5m ; mx
 lea r6, [prep_avx512icl]
 tzcnt wd, wm
 movifnidn hd, hm
 test mxyd, mxyd
 jnz .h
 mov mxyd, r6m ; my
 test mxyd, mxyd
 jnz .v
.prep:
 movzx wd, word [r6+wq*2+table_offset(prep,)]
 mov r5d, r7m ; bitdepth_max
 vpbroadcastd m5, [r6-prep_avx512icl+pw_8192]
 add wq, r6
 shr r5d, 11
 vpbroadcastd m4, [r6-prep_avx512icl+prep_mul+r5*4]
 lea stride3q, [strideq*3]
 jmp wq
.prep_w4:
 mov r3d, 0x0c ; mask selecting qwords 2-3 for the merging insert below
 kmovb k1, r3d
.prep_w4_loop:
 movq xm0, [srcq+strideq*0]
 movhps xm0, [srcq+strideq*1]
 vpbroadcastq ym1, [srcq+strideq*2]
 vpunpcklqdq ym0{k1}, ym1, [srcq+stride3q] {1to4}
 lea srcq, [srcq+strideq*4]
 pmullw ym0, ym4
 psubw ym0, ym5
 mova [tmpq], ym0
 add tmpq, 32
 sub hd, 4
 jg .prep_w4_loop
 RET
.prep_w8:
 movu xm0, [srcq+strideq*0]
 vinserti32x4 ym0, [srcq+strideq*1], 1
 vinserti32x4 m0, [srcq+strideq*2], 2
 vinserti32x4 m0, [srcq+stride3q ], 3
 lea srcq, [srcq+strideq*4]
 pmullw m0, m4
 psubw m0, m5
906 mova [tmpq], m0 907 add tmpq, 64 908 sub hd, 4 909 jg .prep_w8 910 RET 911.prep_w16: 912 movu ym0, [srcq+strideq*0] 913 vinserti32x8 m0, [srcq+strideq*1], 1 914 movu ym1, [srcq+strideq*2] 915 vinserti32x8 m1, [srcq+stride3q ], 1 916 lea srcq, [srcq+strideq*4] 917 pmullw m0, m4 918 pmullw m1, m4 919 psubw m0, m5 920 psubw m1, m5 921 mova [tmpq+64*0], m0 922 mova [tmpq+64*1], m1 923 add tmpq, 64*2 924 sub hd, 4 925 jg .prep_w16 926 RET 927.prep_w32: 928 pmullw m0, m4, [srcq+strideq*0] 929 pmullw m1, m4, [srcq+strideq*1] 930 pmullw m2, m4, [srcq+strideq*2] 931 pmullw m3, m4, [srcq+stride3q ] 932 lea srcq, [srcq+strideq*4] 933 REPX {psubw x, m5}, m0, m1, m2, m3 934 mova [tmpq+64*0], m0 935 mova [tmpq+64*1], m1 936 mova [tmpq+64*2], m2 937 mova [tmpq+64*3], m3 938 add tmpq, 64*4 939 sub hd, 4 940 jg .prep_w32 941 RET 942.prep_w64: 943 pmullw m0, m4, [srcq+strideq*0+64*0] 944 pmullw m1, m4, [srcq+strideq*0+64*1] 945 pmullw m2, m4, [srcq+strideq*1+64*0] 946 pmullw m3, m4, [srcq+strideq*1+64*1] 947 lea srcq, [srcq+strideq*2] 948 REPX {psubw x, m5}, m0, m1, m2, m3 949 mova [tmpq+64*0], m0 950 mova [tmpq+64*1], m1 951 mova [tmpq+64*2], m2 952 mova [tmpq+64*3], m3 953 add tmpq, 64*4 954 sub hd, 2 955 jg .prep_w64 956 RET 957.prep_w128: 958 pmullw m0, m4, [srcq+64*0] 959 pmullw m1, m4, [srcq+64*1] 960 pmullw m2, m4, [srcq+64*2] 961 pmullw m3, m4, [srcq+64*3] 962 add srcq, strideq 963 REPX {psubw x, m5}, m0, m1, m2, m3 964 mova [tmpq+64*0], m0 965 mova [tmpq+64*1], m1 966 mova [tmpq+64*2], m2 967 mova [tmpq+64*3], m3 968 add tmpq, 64*4 969 dec hd 970 jg .prep_w128 971 RET 972.h: 973 vpbroadcastw m5, mxyd 974 mov mxyd, r6m ; my 975 vpbroadcastd m4, [pw_16] 976 vpbroadcastd m6, [pw_32766] 977 psubw m4, m5 978 test dword r7m, 0x800 979 jnz .h_12bpc 980 psllw m4, 2 981 psllw m5, 2 982.h_12bpc: 983 test mxyd, mxyd 984 jnz .hv 985 movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)] 986 add wq, r6 987 lea stride3q, [strideq*3] 988 jmp wq 989.h_w4: 990 movu xm1, [srcq+strideq*0] 
991 vinserti32x4 ym1, [srcq+strideq*2], 1 992 movu xm2, [srcq+strideq*1] 993 vinserti32x4 ym2, [srcq+stride3q ], 1 994 lea srcq, [srcq+strideq*4] 995 punpcklqdq ym0, ym1, ym2 996 psrldq ym1, 2 997 psrldq ym2, 2 998 pmullw ym0, ym4 999 punpcklqdq ym1, ym2 1000 pmullw ym1, ym5 1001 psubw ym0, ym6 1002 paddw ym0, ym1 1003 psraw ym0, 2 1004 mova [tmpq], ym0 1005 add tmpq, 32 1006 sub hd, 4 1007 jg .h_w4 1008 RET 1009.h_w8: 1010 movu xm0, [srcq+strideq*0+0] 1011 movu xm1, [srcq+strideq*0+2] 1012 vinserti32x4 ym0, [srcq+strideq*1+0], 1 1013 vinserti32x4 ym1, [srcq+strideq*1+2], 1 1014 vinserti32x4 m0, [srcq+strideq*2+0], 2 1015 vinserti32x4 m1, [srcq+strideq*2+2], 2 1016 vinserti32x4 m0, [srcq+stride3q +0], 3 1017 vinserti32x4 m1, [srcq+stride3q +2], 3 1018 lea srcq, [srcq+strideq*4] 1019 pmullw m0, m4 1020 pmullw m1, m5 1021 psubw m0, m6 1022 paddw m0, m1 1023 psraw m0, 2 1024 mova [tmpq], m0 1025 add tmpq, 64 1026 sub hd, 4 1027 jg .h_w8 1028 RET 1029.h_w16: 1030 movu ym0, [srcq+strideq*0+0] 1031 vinserti32x8 m0, [srcq+strideq*1+0], 1 1032 movu ym1, [srcq+strideq*0+2] 1033 vinserti32x8 m1, [srcq+strideq*1+2], 1 1034 lea srcq, [srcq+strideq*2] 1035 pmullw m0, m4 1036 pmullw m1, m5 1037 psubw m0, m6 1038 paddw m0, m1 1039 psraw m0, 2 1040 mova [tmpq], m0 1041 add tmpq, 64 1042 sub hd, 2 1043 jg .h_w16 1044 RET 1045.h_w32: 1046 pmullw m0, m4, [srcq+strideq*0+0] 1047 pmullw m2, m5, [srcq+strideq*0+2] 1048 pmullw m1, m4, [srcq+strideq*1+0] 1049 pmullw m3, m5, [srcq+strideq*1+2] 1050 lea srcq, [srcq+strideq*2] 1051 psubw m0, m6 1052 psubw m1, m6 1053 paddw m0, m2 1054 paddw m1, m3 1055 psraw m0, 2 1056 psraw m1, 2 1057 mova [tmpq+64*0], m0 1058 mova [tmpq+64*1], m1 1059 add tmpq, 64*2 1060 sub hd, 2 1061 jg .h_w32 1062 RET 1063.h_w64: 1064 pmullw m0, m4, [srcq+ 0] 1065 pmullw m2, m5, [srcq+ 2] 1066 pmullw m1, m4, [srcq+64] 1067 pmullw m3, m5, [srcq+66] 1068 add srcq, strideq 1069 psubw m0, m6 1070 psubw m1, m6 1071 paddw m0, m2 1072 paddw m1, m3 1073 psraw m0, 2 1074 psraw 
m1, 2 1075 mova [tmpq+64*0], m0 1076 mova [tmpq+64*1], m1 1077 add tmpq, 64*2 1078 dec hd 1079 jg .h_w64 1080 RET 1081.h_w128: 1082 pmullw m0, m4, [srcq+ 0] 1083 pmullw m7, m5, [srcq+ 2] 1084 pmullw m1, m4, [srcq+ 64] 1085 pmullw m8, m5, [srcq+ 66] 1086 pmullw m2, m4, [srcq+128] 1087 pmullw m9, m5, [srcq+130] 1088 pmullw m3, m4, [srcq+192] 1089 pmullw m10, m5, [srcq+194] 1090 add srcq, strideq 1091 REPX {psubw x, m6}, m0, m1, m2, m3 1092 paddw m0, m7 1093 paddw m1, m8 1094 paddw m2, m9 1095 paddw m3, m10 1096 REPX {psraw x, 2}, m0, m1, m2, m3 1097 mova [tmpq+64*0], m0 1098 mova [tmpq+64*1], m1 1099 mova [tmpq+64*2], m2 1100 mova [tmpq+64*3], m3 1101 add tmpq, 64*4 1102 dec hd 1103 jg .h_w128 1104 RET 1105.v: 1106 movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)] 1107 vpbroadcastw m9, mxyd 1108 vpbroadcastd m8, [pw_16] 1109 vpbroadcastd m10, [pw_32766] 1110 add wq, r6 1111 lea stride3q, [strideq*3] 1112 psubw m8, m9 1113 test dword r7m, 0x800 1114 jnz .v_12bpc 1115 psllw m8, 2 1116 psllw m9, 2 1117.v_12bpc: 1118 jmp wq 1119.v_w4: 1120 movq xmm0, [srcq+strideq*0] 1121.v_w4_loop: 1122 vpbroadcastq xmm2, [srcq+strideq*1] 1123 vpbroadcastq ymm1, [srcq+strideq*2] 1124 vpbroadcastq ymm3, [srcq+stride3q ] 1125 lea srcq, [srcq+strideq*4] 1126 vpblendd ymm2, ymm1, 0x30 1127 vpblendd ymm2, ymm3, 0xc0 1128 vpblendd ymm1, ymm2, ymm0, 0x03 ; 0 1 2 3 1129 movq xmm0, [srcq+strideq*0] 1130 valignq ymm2, ymm0, ymm2, 1 ; 1 2 3 4 1131 pmullw ymm1, ym8 1132 pmullw ymm2, ym9 1133 psubw ymm1, ym10 1134 paddw ymm1, ymm2 1135 psraw ymm1, 2 1136 mova [tmpq], ymm1 1137 add tmpq, 32 1138 sub hd, 4 1139 jg .v_w4_loop 1140 vzeroupper 1141 RET 1142.v_w8: 1143 movu xm0, [srcq+strideq*0] 1144.v_w8_loop: 1145 vinserti32x4 ym1, ym0, [srcq+strideq*1], 1 1146 vinserti32x4 m1, [srcq+strideq*2], 2 1147 vinserti32x4 m1, [srcq+stride3q ], 3 ; 0 1 2 3 1148 lea srcq, [srcq+strideq*4] 1149 movu xm0, [srcq+strideq*0] 1150 valignq m2, m0, m1, 2 ; 1 2 3 4 1151 pmullw m1, m8 1152 pmullw m2, m9 1153 psubw m1, 
m10 1154 paddw m1, m2 1155 psraw m1, 2 1156 mova [tmpq], m1 1157 add tmpq, 64 1158 sub hd, 4 1159 jg .v_w8_loop 1160 RET 1161.v_w16: 1162 movu ym0, [srcq+strideq*0] 1163.v_w16_loop: 1164 vinserti32x8 m1, m0, [srcq+strideq*1], 1 ; 0 1 1165 movu ym3, [srcq+strideq*2] 1166 vinserti32x8 m2, m3, [srcq+stride3q ], 1 ; 2 3 1167 lea srcq, [srcq+strideq*4] 1168 movu ym0, [srcq+strideq*0] 1169 vshufi32x4 m3, m1, m3, q1032 ; 1 2 1170 vshufi32x4 m4, m2, m0, q1032 ; 3 4 1171 pmullw m1, m8 1172 pmullw m2, m8 1173 pmullw m3, m9 1174 pmullw m4, m9 1175 psubw m1, m10 1176 psubw m2, m10 1177 paddw m1, m3 1178 paddw m2, m4 1179 psraw m1, 2 1180 psraw m2, 2 1181 mova [tmpq+64*0], m1 1182 mova [tmpq+64*1], m2 1183 add tmpq, 64*2 1184 sub hd, 4 1185 jg .v_w16_loop 1186 RET 1187.v_w32: 1188 movu m0, [srcq+strideq*0] 1189.v_w32_loop: 1190 movu m3, [srcq+strideq*1] 1191 lea srcq, [srcq+strideq*2] 1192 pmullw m1, m8, m0 1193 movu m0, [srcq+strideq*0] 1194 pmullw m2, m8, m3 1195 pmullw m3, m9 1196 pmullw m4, m9, m0 1197 psubw m1, m10 1198 psubw m2, m10 1199 paddw m1, m3 1200 paddw m2, m4 1201 psraw m1, 2 1202 psraw m2, 2 1203 mova [tmpq+64*0], m1 1204 mova [tmpq+64*1], m2 1205 add tmpq, 64*2 1206 sub hd, 2 1207 jg .v_w32_loop 1208 RET 1209.v_w64: 1210 movu m0, [srcq+64*0] 1211 movu m1, [srcq+64*1] 1212.v_w64_loop: 1213 add srcq, strideq 1214 pmullw m2, m8, m0 1215 movu m0, [srcq+64*0] 1216 pmullw m3, m8, m1 1217 movu m1, [srcq+64*1] 1218 pmullw m4, m9, m0 1219 pmullw m5, m9, m1 1220 psubw m2, m10 1221 psubw m3, m10 1222 paddw m2, m4 1223 paddw m3, m5 1224 psraw m2, 2 1225 psraw m3, 2 1226 mova [tmpq+64*0], m2 1227 mova [tmpq+64*1], m3 1228 add tmpq, 64*2 1229 dec hd 1230 jg .v_w64_loop 1231 RET 1232.v_w128: 1233 movu m0, [srcq+64*0] 1234 movu m1, [srcq+64*1] 1235 movu m2, [srcq+64*2] 1236 movu m3, [srcq+64*3] 1237.v_w128_loop: 1238 add srcq, strideq 1239 pmullw m4, m8, m0 1240 movu m0, [srcq+64*0] 1241 pmullw m5, m8, m1 1242 movu m1, [srcq+64*1] 1243 pmullw m6, m8, m2 1244 movu m2, 
[srcq+64*2] 1245 pmullw m7, m8, m3 1246 movu m3, [srcq+64*3] 1247 pmullw m11, m9, m0 1248 pmullw m12, m9, m1 1249 pmullw m13, m9, m2 1250 pmullw m14, m9, m3 1251 REPX {psubw x, m10}, m4, m5, m6, m7 1252 paddw m4, m11 1253 paddw m5, m12 1254 paddw m6, m13 1255 paddw m7, m14 1256 REPX {psraw x, 2}, m4, m5, m6, m7 1257 mova [tmpq+64*0], m4 1258 mova [tmpq+64*1], m5 1259 mova [tmpq+64*2], m6 1260 mova [tmpq+64*3], m7 1261 add tmpq, 64*4 1262 dec hd 1263 jg .v_w128_loop 1264 RET 1265.hv: 1266 movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)] 1267 shl mxyd, 11 1268 vpbroadcastw m7, mxyd 1269 add wq, r6 1270 lea stride3q, [strideq*3] 1271 jmp wq 1272.hv_w4: 1273 movq xmm0, [srcq+strideq*0+0] 1274 movq xmm1, [srcq+strideq*0+2] 1275 pmullw xmm0, xm4 1276 pmullw xmm1, xm5 1277 psubw xmm0, xm6 1278 paddw xmm0, xmm1 1279 psraw xmm0, 2 1280 vpbroadcastq ym0, xmm0 1281.hv_w4_loop: 1282 movu xm1, [srcq+strideq*1] 1283 vinserti128 ym1, [srcq+stride3q ], 1 1284 movu xm2, [srcq+strideq*2] 1285 lea srcq, [srcq+strideq*4] 1286 vinserti128 ym2, [srcq+strideq*0], 1 1287 punpcklqdq ym3, ym1, ym2 1288 psrldq ym1, 2 1289 psrldq ym2, 2 1290 pmullw ym3, ym4 1291 punpcklqdq ym1, ym2 1292 pmullw ym1, ym5 1293 psubw ym3, ym6 1294 paddw ym1, ym3 1295 psraw ym1, 2 ; 1 2 3 4 1296 valignq ym2, ym1, ym0, 3 ; 0 1 2 3 1297 mova ym0, ym1 1298 psubw ym1, ym2 1299 pmulhrsw ym1, ym7 1300 paddw ym1, ym2 1301 mova [tmpq], ym1 1302 add tmpq, 32 1303 sub hd, 4 1304 jg .hv_w4_loop 1305 RET 1306.hv_w8: 1307 pmullw xm0, xm4, [srcq+strideq*0+0] 1308 pmullw xm1, xm5, [srcq+strideq*0+2] 1309 psubw xm0, xm6 1310 paddw xm0, xm1 1311 psraw xm0, 2 1312 vinserti32x4 m0, xm0, 3 1313.hv_w8_loop: 1314 movu xm1, [srcq+strideq*1+0] 1315 movu xm2, [srcq+strideq*1+2] 1316 vinserti32x4 ym1, [srcq+strideq*2+0], 1 1317 vinserti32x4 ym2, [srcq+strideq*2+2], 1 1318 vinserti32x4 m1, [srcq+stride3q +0], 2 1319 vinserti32x4 m2, [srcq+stride3q +2], 2 1320 lea srcq, [srcq+strideq*4] 1321 vinserti32x4 m1, [srcq+strideq*0+0], 3 1322 
vinserti32x4 m2, [srcq+strideq*0+2], 3 1323 pmullw m1, m4 1324 pmullw m2, m5 1325 psubw m1, m6 1326 paddw m1, m2 1327 psraw m1, 2 ; 1 2 3 4 1328 valignq m2, m1, m0, 6 ; 0 1 2 3 1329 mova m0, m1 1330 psubw m1, m2 1331 pmulhrsw m1, m7 1332 paddw m1, m2 1333 mova [tmpq], m1 1334 add tmpq, 64 1335 sub hd, 4 1336 jg .hv_w8_loop 1337 RET 1338.hv_w16: 1339 pmullw ym0, ym4, [srcq+strideq*0+0] 1340 pmullw ym1, ym5, [srcq+strideq*0+2] 1341 psubw ym0, ym6 1342 paddw ym0, ym1 1343 psraw ym0, 2 1344 vinserti32x8 m0, ym0, 1 1345.hv_w16_loop: 1346 movu ym1, [srcq+strideq*1+0] 1347 movu ym2, [srcq+strideq*1+2] 1348 lea srcq, [srcq+strideq*2] 1349 vinserti32x8 m1, [srcq+strideq*0+0], 1 1350 vinserti32x8 m2, [srcq+strideq*0+2], 1 1351 pmullw m1, m4 1352 pmullw m2, m5 1353 psubw m1, m6 1354 paddw m1, m2 1355 psraw m1, 2 ; 1 2 1356 vshufi32x4 m2, m0, m1, q1032 ; 0 1 1357 mova m0, m1 1358 psubw m1, m2 1359 pmulhrsw m1, m7 1360 paddw m1, m2 1361 mova [tmpq], m1 1362 add tmpq, 64 1363 sub hd, 2 1364 jg .hv_w16_loop 1365 RET 1366.hv_w32: 1367 pmullw m0, m4, [srcq+strideq*0+0] 1368 pmullw m1, m5, [srcq+strideq*0+2] 1369 psubw m0, m6 1370 paddw m0, m1 1371 psraw m0, 2 1372.hv_w32_loop: 1373 pmullw m3, m4, [srcq+strideq*1+0] 1374 pmullw m1, m5, [srcq+strideq*1+2] 1375 lea srcq, [srcq+strideq*2] 1376 psubw m3, m6 1377 paddw m3, m1 1378 psraw m3, 2 1379 psubw m1, m3, m0 1380 pmulhrsw m1, m7 1381 paddw m1, m0 1382 pmullw m0, m4, [srcq+strideq*0+0] 1383 pmullw m2, m5, [srcq+strideq*0+2] 1384 psubw m0, m6 1385 paddw m0, m2 1386 psraw m0, 2 1387 psubw m2, m0, m3 1388 pmulhrsw m2, m7 1389 paddw m2, m3 1390 mova [tmpq+64*0], m1 1391 mova [tmpq+64*1], m2 1392 add tmpq, 64*2 1393 sub hd, 2 1394 jg .hv_w32_loop 1395 RET 1396.hv_w64: 1397 pmullw m0, m4, [srcq+ 0] 1398 pmullw m2, m5, [srcq+ 2] 1399 pmullw m1, m4, [srcq+64] 1400 pmullw m3, m5, [srcq+66] 1401 psubw m0, m6 1402 psubw m1, m6 1403 paddw m0, m2 1404 paddw m1, m3 1405 psraw m0, 2 1406 psraw m1, 2 1407.hv_w64_loop: 1408 add srcq, strideq 1409 
pmullw m2, m4, [srcq+ 0] 1410 pmullw m8, m5, [srcq+ 2] 1411 pmullw m3, m4, [srcq+64] 1412 pmullw m9, m5, [srcq+66] 1413 psubw m2, m6 1414 psubw m3, m6 1415 paddw m2, m8 1416 paddw m3, m9 1417 psraw m2, 2 1418 psraw m3, 2 1419 psubw m8, m2, m0 1420 psubw m9, m3, m1 1421 pmulhrsw m8, m7 1422 pmulhrsw m9, m7 1423 paddw m8, m0 1424 mova m0, m2 1425 paddw m9, m1 1426 mova m1, m3 1427 mova [tmpq+64*0], m8 1428 mova [tmpq+64*1], m9 1429 add tmpq, 64*2 1430 dec hd 1431 jg .hv_w64_loop 1432 RET 1433.hv_w128: 1434 pmullw m0, m4, [srcq+ 0] 1435 pmullw m8, m5, [srcq+ 2] 1436 pmullw m1, m4, [srcq+ 64] 1437 pmullw m9, m5, [srcq+ 66] 1438 pmullw m2, m4, [srcq+128] 1439 pmullw m10, m5, [srcq+130] 1440 pmullw m3, m4, [srcq+192] 1441 pmullw m11, m5, [srcq+194] 1442 REPX {psubw x, m6}, m0, m1, m2, m3 1443 paddw m0, m8 1444 paddw m1, m9 1445 paddw m2, m10 1446 paddw m3, m11 1447 REPX {psraw x, 2}, m0, m1, m2, m3 1448.hv_w128_loop: 1449 add srcq, strideq 1450 pmullw m8, m4, [srcq+ 0] 1451 pmullw m12, m5, [srcq+ 2] 1452 pmullw m9, m4, [srcq+ 64] 1453 pmullw m13, m5, [srcq+ 66] 1454 pmullw m10, m4, [srcq+128] 1455 pmullw m14, m5, [srcq+130] 1456 pmullw m11, m4, [srcq+192] 1457 pmullw m15, m5, [srcq+194] 1458 REPX {psubw x, m6}, m8, m9, m10, m11 1459 paddw m8, m12 1460 paddw m9, m13 1461 paddw m10, m14 1462 paddw m11, m15 1463 REPX {psraw x, 2}, m8, m9, m10, m11 1464 psubw m12, m8, m0 1465 psubw m13, m9, m1 1466 psubw m14, m10, m2 1467 psubw m15, m11, m3 1468 REPX {pmulhrsw x, m7}, m12, m13, m14, m15 1469 paddw m12, m0 1470 mova m0, m8 1471 paddw m13, m1 1472 mova m1, m9 1473 mova [tmpq+64*0], m12 1474 mova [tmpq+64*1], m13 1475 paddw m14, m2 1476 mova m2, m10 1477 paddw m15, m3 1478 mova m3, m11 1479 mova [tmpq+64*2], m14 1480 mova [tmpq+64*3], m15 1481 add tmpq, 64*4 1482 dec hd 1483 jg .hv_w128_loop 1484 RET 1485 1486; int8_t subpel_filters[5][15][8] 1487%assign FILTER_REGULAR (0*15 << 16) | 3*15 1488%assign FILTER_SMOOTH (1*15 << 16) | 4*15 1489%assign FILTER_SHARP (2*15 << 16) | 3*15 
1490 1491%macro FN 4-5 ; prefix, type, type_h, type_v, jmp_to 1492cglobal %1_%2_16bpc 1493 mov t0d, FILTER_%3 1494%ifidn %3, %4 1495 mov t1d, t0d 1496%else 1497 mov t1d, FILTER_%4 1498%endif 1499%if %0 == 5 ; skip the jump in the last filter 1500 jmp mangle(private_prefix %+ _%5 %+ SUFFIX) 1501%endif 1502%endmacro 1503 1504%macro MC_8TAP_FN 4 ; prefix, type, type_h, type_v 1505cglobal %1_8tap_%2_16bpc 1506 mov t0d, FILTER_%3 1507%ifidn %3, %4 1508 mov t1d, t0d 1509%else 1510 mov t1d, FILTER_%4 1511%endif 1512%ifnidn %2, regular ; skip the jump in the last filter 1513 jmp mangle(private_prefix %+ _%1_8tap_16bpc %+ SUFFIX) 1514%endif 1515%endmacro 1516 1517%if WIN64 1518DECLARE_REG_TMP 4, 5 1519%define buf rsp+stack_offset+8 ; shadow space 1520%else 1521DECLARE_REG_TMP 7, 8 1522%define buf rsp-40 ; red zone 1523%endif 1524 1525%define PUT_8TAP_FN FN put_8tap, 1526PUT_8TAP_FN smooth, SMOOTH, SMOOTH, put_6tap_16bpc 1527PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR, put_6tap_16bpc 1528PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH, put_6tap_16bpc 1529PUT_8TAP_FN regular, REGULAR, REGULAR 1530 1531cglobal put_6tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my 1532%define base r8-put_avx512icl 1533 imul mxd, mxm, 0x010101 1534 add mxd, t0d ; 6tap_h, mx, 4tap_h 1535 imul myd, mym, 0x010101 1536 add myd, t1d ; 6tap_v, my, 4tap_v 1537 lea r8, [put_avx512icl] 1538 movifnidn wd, wm 1539 movifnidn hd, hm 1540 test mxd, 0xf00 1541 jnz .h 1542 test myd, 0xf00 1543 jnz .v 1544.put: 1545 tzcnt wd, wd 1546 movzx wd, word [r8+wq*2+table_offset(put,)] 1547 add wq, r8 1548%if WIN64 1549 pop r8 1550%endif 1551 jmp wq 1552.h_w8: 1553 mova m4, [spel_h_shufA] 1554 movu m5, [spel_h_shufB] 1555 movu m6, [spel_h_shufC] 1556.h_w8_loop: 1557 movu ym2, [srcq+ssq*0] 1558 vinserti32x8 m2, [srcq+ssq*1], 1 1559 lea srcq, [srcq+ssq*2] 1560 mova m0, m8 1561 vpermb m1, m4, m2 1562 vpdpwssd m0, m10, m1 1563 vpermb m1, m5, m2 1564 vpdpwssd m0, m11, m1 1565 vpermb m1, m6, m2 1566 vpdpwssd m0, m12, m1 1567 
psrad m0, 6 1568 vextracti32x8 ym1, m0, 1 1569 packusdw ym0, ym1 1570 pminsw ym0, ym15 1571 mova [dstq+dsq*0], xm0 1572 vextracti32x4 [dstq+dsq*1], ym0, 1 1573 lea dstq, [dstq+dsq*2] 1574 sub hd, 2 1575 jg .h_w8_loop 1576 RET 1577.h: 1578 vpbroadcastw m15, r8m 1579 test myd, 0xf00 1580 jnz .hv 1581 mov r7d, r8m 1582 shr r7d, 11 1583 vpbroadcastd m8, [base+put_8tap_h_rnd+r7*4] 1584 cmp wd, 4 1585 jle mangle(private_prefix %+ _put_8tap_16bpc_avx512icl).h_w4 1586 shr mxd, 16 1587 sub srcq, 4 1588 pmovsxbw xmm0, [base+subpel_filters+1+mxq*8] 1589 mova [buf], xmm0 1590 vpbroadcastd m10, xmm0 1591 vpbroadcastd m12, [buf+8] 1592 vpbroadcastd m11, [buf+4] 1593 sub wd, 16 1594 jl .h_w8 1595 vbroadcasti32x4 m6, [spel_h_shufA] 1596 vbroadcasti32x4 m7, [spel_h_shufB] 1597 jg .h_w32 1598.h_w16_loop: 1599 movu ym2, [srcq+ssq*0+ 0] 1600 vinserti32x8 m2, [srcq+ssq*1+ 0], 1 1601 movu ym3, [srcq+ssq*0+12] 1602 vinserti32x8 m3, [srcq+ssq*1+12], 1 1603 lea srcq, [srcq+ssq*2] 1604 mova m0, m8 1605 mova m1, m8 1606 pshufb m4, m2, m6 1607 vpdpwssd m0, m10, m4 ; a0 b0 1608 pshufb m4, m3, m7 1609 vpdpwssd m1, m12, m4 ; a2' b2' 1610 pshufb m2, m7 1611 pshufb m3, m6 1612 vpdpwssd m0, m11, m2 ; a1 b1 1613 vpdpwssd m1, m11, m3 ; a1' b1' 1614 shufpd m2, m3, 0x55 1615 vpdpwssd m0, m12, m2 ; a2 b2 1616 vpdpwssd m1, m10, m2 ; a0' b0' 1617 psrad m0, 6 1618 psrad m1, 6 1619 packusdw m0, m1 1620 pminsw m0, m15 1621 mova [dstq+dsq*0], ym0 1622 vextracti32x8 [dstq+dsq*1], m0, 1 1623 lea dstq, [dstq+dsq*2] 1624 sub hd, 2 1625 jg .h_w16_loop 1626 RET 1627.h_w32: 1628 lea srcq, [srcq+wq*2] 1629 lea dstq, [dstq+wq*2] 1630 neg wq 1631.h_w32_loop0: 1632 mov r6, wq 1633.h_w32_loop: 1634 movu m2, [srcq+r6*2+ 0] 1635 movu m3, [srcq+r6*2+12] 1636 mova m0, m8 1637 mova m1, m8 1638 pshufb m4, m2, m6 1639 vpdpwssd m0, m10, m4 ; a0 1640 pshufb m4, m3, m7 1641 vpdpwssd m1, m12, m4 ; b2 1642 pshufb m2, m7 1643 pshufb m3, m6 1644 vpdpwssd m0, m11, m2 ; a1 1645 vpdpwssd m1, m11, m3 ; b1 1646 shufpd m2, m3, 0x55 1647 
vpdpwssd m0, m12, m2 ; a2 1648 vpdpwssd m1, m10, m2 ; b0 1649 psrad m0, 6 1650 psrad m1, 6 1651 packusdw m0, m1 1652 pminsw m0, m15 1653 mova [dstq+r6*2], m0 1654 add r6, 32 1655 jl .h_w32_loop 1656 add srcq, ssq 1657 add dstq, dsq 1658 dec hd 1659 jg .h_w32_loop0 1660 RET 1661.v: 1662 movzx mxd, myb 1663 shr myd, 16 1664 cmp hd, 6 1665 cmovs myd, mxd 1666 vpbroadcastd m11, [pd_32] 1667 pmovsxbw xmm0, [base+subpel_filters+1+myq*8] 1668 tzcnt r7d, wd 1669 vpbroadcastw m15, r8m 1670 mov r6, ssq 1671 movzx r7d, word [r8+r7*2+table_offset(put, _6tap_v)] 1672 neg r6 1673 mova [rsp+stack_offset+8], xmm0 1674 vpbroadcastd m12, xmm0 1675 add r7, r8 1676 vpbroadcastd m13, [rsp+stack_offset+12] 1677 vpbroadcastd m14, [rsp+stack_offset+16] 1678 jmp r7 1679.v_w2: 1680 movd xmm2, [srcq+r6 *2] 1681 pinsrd xmm2, [srcq+r6 *1], 1 1682 pinsrd xmm2, [srcq+ssq*0], 2 1683 pinsrd xmm2, [srcq+ssq*1], 3 ; 0 1 2 3 1684 lea srcq, [srcq+ssq*2] 1685 movd xmm0, [srcq+ssq*0] 1686 palignr xmm3, xmm0, xmm2, 4 ; 1 2 3 4 1687 punpcklwd xmm1, xmm2, xmm3 ; 01 12 1688 punpckhwd xmm2, xmm3 ; 23 34 1689.v_w2_loop: 1690 movd xmm3, [srcq+ssq*1] 1691 mova xmm4, xm11 1692 vpdpwssd xmm4, xmm1, xm12 ; a0 b0 1693 lea srcq, [srcq+ssq*2] 1694 mova xmm1, xmm2 1695 vpdpwssd xmm4, xmm2, xm13 ; a1 b1 1696 punpckldq xmm2, xmm0, xmm3 ; 4 5 1697 movd xmm0, [srcq+ssq*0] 1698 punpckldq xmm3, xmm0 ; 5 6 1699 punpcklwd xmm2, xmm3 ; 45 56 1700 vpdpwssd xmm4, xmm2, xm14 ; a2 b2 1701 psrad xmm4, 6 1702 packusdw xmm4, xmm4 1703 pminsw xmm4, xm15 1704 movd [dstq+dsq*0], xmm4 1705 pextrd [dstq+dsq*1], xmm4, 1 1706 lea dstq, [dstq+dsq*2] 1707 sub hd, 2 1708 jg .v_w2_loop 1709 RET 1710.v_w4: 1711 movq xmm1, [srcq+r6 *2] 1712 vpbroadcastq ymm3, [srcq+r6 *1] 1713 vpbroadcastq ymm2, [srcq+ssq*0] 1714 vpbroadcastq ymm4, [srcq+ssq*1] 1715 lea srcq, [srcq+ssq*2] 1716 vpbroadcastq ymm0, [srcq+ssq*0] 1717 vpblendd ymm1, ymm3, 0x30 1718 vpblendd ymm3, ymm2, 0x30 1719 punpcklwd ymm1, ymm3 ; 01 12 1720 vpblendd ymm2, ymm4, 0x30 1721 vpblendd 
ymm4, ymm0, 0x30 1722 punpcklwd ymm2, ymm4 ; 23 34 1723.v_w4_loop: 1724 vpbroadcastq ymm3, [srcq+ssq*1] 1725 mova ymm4, ym11 1726 vpdpwssd ymm4, ymm1, ym12 ; a0 b0 1727 lea srcq, [srcq+ssq*2] 1728 mova ymm1, ymm2 1729 vpdpwssd ymm4, ymm2, ym13 ; a1 b1 1730 vpblendd ymm2, ymm0, ymm3, 0x30 1731 vpbroadcastq ymm0, [srcq+ssq*0] 1732 vpblendd ymm3, ymm0, 0x30 1733 punpcklwd ymm2, ymm3 ; 45 56 1734 vpdpwssd ymm4, ymm2, ym14 ; a2 b2 1735 psrad ymm4, 6 1736 vextracti128 xmm3, ymm4, 1 1737 packusdw xmm4, xmm3 1738 pminsw xmm4, xm15 1739 movq [dstq+dsq*0], xmm4 1740 movhps [dstq+dsq*1], xmm4 1741 lea dstq, [dstq+dsq*2] 1742 sub hd, 2 1743 jg .v_w4_loop 1744 vzeroupper 1745 RET 1746.v_w8: 1747 vbroadcasti32x4 m0, [srcq+ssq*0] 1748 vinserti32x4 m1, m0, [srcq+r6 *2], 0 1749 vinserti32x4 m1, [srcq+r6 *1], 1 ; 0 1 2 1750 vinserti32x4 ym0, [srcq+ssq*1], 1 1751 lea srcq, [srcq+ssq*2] 1752 mova m5, [spel_v_shuf8] 1753 vinserti32x4 m0, [srcq+ssq*0], 2 ; 2 3 4 1754 vpermb m1, m5, m1 ; 01 12 1755 vpermb m2, m5, m0 ; 23 34 1756.v_w8_loop: 1757 vinserti32x4 m0, [srcq+ssq*1], 3 1758 lea srcq, [srcq+ssq*2] 1759 movu xm3, [srcq+ssq*0] 1760 mova m4, m11 1761 vpdpwssd m4, m12, m1 ; a0 b0 1762 vshufi32x4 m0, m3, q1032 ; 4 5 6 1763 mova m1, m2 1764 vpdpwssd m4, m13, m2 ; a1 b1 1765 vpermb m2, m5, m0 ; 45 56 1766 vpdpwssd m4, m14, m2 ; a2 b2 1767 psrad m4, 6 1768 vextracti32x8 ym3, m4, 1 1769 packusdw ym4, ym3 1770 pminsw ym4, ym15 1771 mova [dstq+dsq*0], xm4 1772 vextracti32x4 [dstq+dsq*1], ym4, 1 1773 lea dstq, [dstq+dsq*2] 1774 sub hd, 2 1775 jg .v_w8_loop 1776 RET 1777.v_w16: 1778 vbroadcasti32x8 m0, [srcq+r6 *1] 1779 vinserti32x8 m1, m0, [srcq+ssq*0], 1 1780 vinserti32x8 m0, [srcq+r6*2], 0 1781 mova m6, [spel_v_shuf16] 1782 movu ym3, [srcq+ssq*1] 1783 lea srcq, [srcq+ssq*2] 1784 vinserti32x8 m3, [srcq+ssq*0], 1 1785 vpermb m1, m6, m1 ; 12 1786 vpermb m0, m6, m0 ; 01 1787 vpermb m3, m6, m3 ; 34 1788 mova m7, [deint_q_shuf] 1789 vpshrdd m2, m1, m3, 16 ; 23 1790.v_w16_loop: 1791 mova m5, m11 
1792 vpdpwssd m5, m12, m1 ; b0 1793 mova m4, m11 1794 vpdpwssd m4, m12, m0 ; a0 1795 mova m1, m3 1796 vpdpwssd m5, m13, m3 ; b1 1797 mova m0, m2 1798 vpdpwssd m4, m13, m2 ; a1 1799 movu ym3, [srcq+ssq*1] 1800 lea srcq, [srcq+ssq*2] 1801 vinserti32x8 m3, [srcq+ssq*0], 1 1802 vpermb m3, m6, m3 ; 56 1803 vpshrdd m2, m1, m3, 16 ; 45 1804 vpdpwssd m5, m14, m3 ; b2 1805 vpdpwssd m4, m14, m2 ; a2 1806 psrad m5, 6 1807 psrad m4, 6 1808 packusdw m4, m5 1809 pminsw m4, m15 1810 vpermq m4, m7, m4 1811 mova [dstq+dsq*0], ym4 1812 vextracti32x8 [dstq+dsq*1], m4, 1 1813 lea dstq, [dstq+dsq*2] 1814 sub hd, 2 1815 jg .v_w16_loop 1816 RET 1817.v_w32: 1818.v_w64: 1819.v_w128: 1820 lea wd, [hq+wq*8-256] 1821.v_w32_loop0: 1822 movu m16, [srcq+r6 *2] 1823 movu m17, [srcq+r6 *1] 1824 lea r7, [srcq+ssq*2] 1825 movu m18, [srcq+ssq*0] 1826 movu m19, [srcq+ssq*1] 1827 mov r8, dstq 1828 movu m20, [r7 +ssq*0] 1829 punpcklwd m0, m16, m17 ; 01 1830 punpckhwd m16, m17 1831 punpcklwd m1, m17, m18 ; 12 1832 punpckhwd m17, m18 1833 punpcklwd m2, m18, m19 ; 23 1834 punpckhwd m18, m19 1835 punpcklwd m3, m19, m20 ; 34 1836 punpckhwd m19, m20 1837.v_w32_loop: 1838 mova m4, m11 1839 vpdpwssd m4, m12, m0 ; a0 1840 mova m6, m11 1841 vpdpwssd m6, m12, m16 1842 mova m5, m11 1843 vpdpwssd m5, m12, m1 ; b0 1844 mova m7, m11 1845 vpdpwssd m7, m12, m17 1846 mova m0, m2 1847 vpdpwssd m4, m13, m2 ; a1 1848 mova m16, m18 1849 vpdpwssd m6, m13, m18 1850 mova m1, m3 1851 vpdpwssd m5, m13, m3 ; b1 1852 mova m17, m19 1853 vpdpwssd m7, m13, m19 1854 movu m19, [r7+ssq*1] 1855 lea r7, [r7+ssq*2] 1856 punpcklwd m2, m20, m19 ; 45 1857 punpckhwd m18, m20, m19 1858 movu m20, [r7+ssq*0] 1859 vpdpwssd m4, m14, m2 ; a2 1860 vpdpwssd m6, m14, m18 1861 punpcklwd m3, m19, m20 ; 56 1862 punpckhwd m19, m20 1863 vpdpwssd m5, m14, m3 ; b2 1864 vpdpwssd m7, m14, m19 1865 REPX {psrad x, 6}, m4, m6, m5, m7 1866 packusdw m4, m6 1867 packusdw m5, m7 1868 pminsw m4, m15 1869 pminsw m5, m15 1870 mova [r8+dsq*0], m4 1871 mova [r8+dsq*1], m5 
1872 lea r8, [r8+dsq*2] 1873 sub hd, 2 1874 jg .v_w32_loop 1875 add srcq, 64 1876 add dstq, 64 1877 movzx hd, wb 1878 sub wd, 1<<8 1879 jg .v_w32_loop0 1880 vzeroupper 1881 RET 1882.hv: 1883 cmp wd, 4 1884 jg .hv_w8 1885 movzx mxd, mxb 1886 pmovsxbw xmm0, [base+subpel_filters+mxq*8] 1887 movzx mxd, myb 1888 shr myd, 16 1889 cmp hd, 6 1890 cmovs myd, mxd 1891 pmovsxbw xmm1, [base+subpel_filters+1+myq*8] 1892 mov r6, ssq 1893 sub srcq, 2 1894 neg r6 1895 test dword r8m, 0x800 1896 jnz .hv_12bit 1897 vpbroadcastd m10, [pd_2176] 1898 psllw xmm0, 6 1899 jmp .hv_main 1900.hv_12bit: 1901 vpbroadcastd m10, [pd_640] 1902 psllw xmm0, 4 1903 psllw xmm1, 2 1904.hv_main: 1905 movu xm4, [srcq+r6 *2] 1906 vinserti32x4 ym4, [srcq+r6 *1], 1 1907 vinserti32x4 m4, [srcq+ssq*0], 2 1908 vbroadcasti32x4 m6, [spel_h_shufA] 1909 vinserti32x4 m4, [srcq+ssq*1], 3 ; 0 1 2 3 1910 lea srcq, [srcq+ssq*2] 1911 movu xm5, [srcq+ssq*0] ; 4 1912 mova [buf+ 0], xmm0 1913 mova [buf+16], xmm1 1914 vpbroadcastd m8, [buf+ 4] 1915 vpbroadcastd m9, [buf+ 8] 1916 vpbroadcastd ym12, xmm1 1917 vpbroadcastd ym13, [buf+20] 1918 vpbroadcastd ym14, [buf+24] 1919 cmp wd, 4 1920 je .hv_w4 1921 vbroadcasti32x4 m2, [spel_h_shufA] 1922 mova m3, [spel_h_shuf2b] 1923 mova m1, m10 1924 pshufb m4, m6 1925 pshufb xm5, xm6 1926 punpcklqdq m2, m4, m5 1927 vpdpwssd m1, m8, m2 ; 04 1_ 2_ 3_ 1928 mova ym6, [spel_h_shuf2a] 1929 punpckhqdq m4, m5 1930 mova xm5, [spel_shuf2] 1931 vpdpwssd m1, m9, m4 1932 vpermb m1, m3, m1 ; 01 12 1933 vextracti32x4 xm2, ym1, 1 ; 23 34 1934.hv_w2_loop: 1935 movu xm3, [srcq+ssq*1] 1936 lea srcq, [srcq+ssq*2] 1937 vinserti32x4 ym3, [srcq+ssq*0], 1 1938 vpermb ym3, ym6, ym3 1939 pmaddwd xmm0, xm12, xm1 ; a0 b0 1940 mova xm4, xm10 1941 vpdpwssd xm4, xm8, xm3 1942 vextracti32x4 xm3, ym3, 1 1943 mova xm1, xm2 1944 vpdpwssd xmm0, xm13, xm2 ; a1 b1 1945 vpdpwssd xm4, xm9, xm3 ; 5 6 1946 vpermt2b xm2, xm5, xm4 ; 45 56 1947 vpdpwssd xmm0, xm14, xm2 ; a2 b2 1948 psrad xmm0, 10 1949 packusdw xmm0, xmm0 1950 
pminsw xmm0, xm15 1951 movd [dstq+dsq*0], xmm0 1952 pextrd [dstq+dsq*1], xmm0, 1 1953 lea dstq, [dstq+dsq*2] 1954 sub hd, 2 1955 jg .hv_w2_loop 1956 RET 1957.hv_w4: 1958 vbroadcasti32x4 m7, [spel_h_shufB] 1959 mova ym0, [spel_shuf4a] 1960 pshufb m1, m4, m6 1961 mova m2, m10 1962 vpdpwssd m2, m8, m1 1963 pshufb xm1, xm5, xm6 1964 mova xm3, xm10 1965 vpdpwssd xm3, xm8, xm1 1966 pshufb m4, m7 1967 pshufb xm5, xm7 1968 vpdpwssd m2, m9, m4 ; 0 1 2 3 1969 vpdpwssd xm3, xm9, xm5 ; 4 1970 mova ym5, [spel_shuf4b] 1971 vpermb m1, m0, m2 ; 01 12 1972 vshufi32x4 m2, m3, q1032 ; 2 3 4 1973 vpermb m2, m0, m2 ; 23 34 1974.hv_w4_loop: 1975 movu xm3, [srcq+ssq*1] 1976 lea srcq, [srcq+ssq*2] 1977 vinserti32x4 ym3, [srcq+ssq*0], 1 1978 pmaddwd ym0, ym12, ym1 ; a0 b0 1979 mova ym1, ym2 1980 pshufb ym4, ym3, ym6 1981 mova ym2, ym10 1982 vpdpwssd ym2, ym8, ym4 1983 pshufb ym3, ym7 1984 vpdpwssd ym0, ym13, ym1 ; a1 b1 1985 vpdpwssd ym2, ym9, ym3 ; 5 6 1986 vpermt2b ym2, ym5, ym1 ; 45 56 1987 vpdpwssd ym0, ym14, ym2 ; a2 b2 1988 psrad ym0, 10 1989 vextracti32x4 xm4, ym0, 1 1990 packusdw xm0, xm4 1991 pminsw xmm0, xm0, xm15 1992 movq [dstq+dsq*0], xmm0 1993 movhps [dstq+dsq*1], xmm0 1994 lea dstq, [dstq+dsq*2] 1995 sub hd, 2 1996 jg .hv_w4_loop 1997 RET 1998.hv_w8: 1999 shr mxd, 16 2000 pmovsxbw xmm0, [base+subpel_filters+1+mxq*8] 2001 movzx mxd, myb 2002 shr myd, 16 2003 cmp hd, 6 2004 cmovs myd, mxd 2005 pmovsxbw xmm1, [base+subpel_filters+1+myq*8] 2006 mov r6, ssq 2007 sub srcq, 4 2008 neg r6 2009 test dword r8m, 0x800 2010 jnz .hv_w8_12bit 2011 vpbroadcastd m8, [pd_2176] 2012 psllw xmm0, 6 2013 jmp .hv_w8_main 2014.hv_w8_12bit: 2015 vpbroadcastd m8, [pd_640] 2016 psllw xmm0, 4 2017 psllw xmm1, 2 2018.hv_w8_main: 2019 mova [buf+ 0], xmm0 2020 mova [buf+16], xmm1 2021 vpbroadcastd m9, xmm0 2022 vpbroadcastd m10, [buf+ 4] 2023 vpbroadcastd m11, [buf+ 8] 2024 vpbroadcastd m12, xmm1 2025 vpbroadcastd m13, [buf+20] 2026 vpbroadcastd m14, [buf+24] 2027 cmp wd, 16 2028 jge .hv_w16 2029 mova 
m6, [spel_h_shufA] 2030 movu ym16, [srcq+r6 *2] 2031 vinserti32x8 m16, [srcq+r6 *1], 1 ; 0 1 2032 movu ym17, [srcq+ssq*0] 2033 vinserti32x8 m17, [srcq+ssq*1], 1 ; 2 3 2034 lea srcq, [srcq+ssq*2] 2035 movu ym18, [srcq+ssq*0] ; 4 2036 movu m7, [spel_h_shufC] 2037 vpermb m3, m6, m16 2038 mova m1, m8 2039 vpermb m4, m6, m17 2040 vpdpwssd m1, m9, m3 ; a0 b0 2041 mova m2, m8 2042 vpermb m5, m6, m18 2043 vpdpwssd m2, m9, m4 ; c0 d0 2044 mova m0, m8 2045 vpermb m16, m7, m16 2046 vpdpwssd m0, m9, m5 ; e0 2047 vpermb m17, m7, m17 2048 vpdpwssd m1, m11, m16 ; a2 b2 2049 vpermb m18, m7, m18 2050 vpdpwssd m2, m11, m17 ; c2 d2 2051 shufpd m3, m16, 0x55 2052 vpdpwssd m0, m11, m18 ; e2 2053 mova m16, [spel_shuf8a] 2054 shufpd m4, m17, 0x55 2055 vpdpwssd m1, m10, m3 ; a1 b1 2056 shufpd m5, m18, 0x55 2057 vpdpwssd m2, m10, m4 ; c1 d1 2058 vpdpwssd m0, m10, m5 ; e1 2059 mova m5, [spel_shuf8b] 2060 vpermt2b m1, m16, m2 ; 01 12 2061 vpermt2b m2, m16, m0 ; 23 34 2062.hv_w8_loop: 2063 movu ym18, [srcq+ssq*1] 2064 lea srcq, [srcq+ssq*2] 2065 vinserti32x8 m18, [srcq+ssq*0], 1 2066 mova m0, m8 2067 vpermb m17, m6, m18 2068 vpdpwssd m0, m9, m17 ; f0 g0 2069 vpermb m18, m7, m18 2070 pmaddwd m16, m12, m1 ; A0 B0 2071 vpdpwssd m0, m11, m18 ; f2 g2 2072 shufpd m17, m18, 0x55 2073 mova m1, m2 2074 vpdpwssd m16, m13, m2 ; A1 B1 2075 vpdpwssd m0, m10, m17 ; f1 g1 2076 vpermt2b m2, m5, m0 ; 45 56 2077 vpdpwssd m16, m14, m2 ; A2 B2 2078 psrad m16, 10 2079 vextracti32x8 ym17, m16, 1 2080 packusdw ym16, ym17 2081 pminsw ym16, ym15 2082 mova [dstq+dsq*0], xm16 2083 vextracti128 [dstq+dsq*1], ym16, 1 2084 lea dstq, [dstq+dsq*2] 2085 sub hd, 2 2086 jg .hv_w8_loop 2087 vzeroupper 2088 RET 2089.hv_w16: 2090 vbroadcasti32x4 m20, [spel_h_shufA] 2091 vbroadcasti32x4 m21, [spel_h_shufB] 2092 jg .hv_w32 2093 vbroadcasti32x8 m6, [srcq+r6 *2+ 8] 2094 vinserti32x8 m2, m6, [srcq+r6 *2+16], 1 2095 vinserti32x8 m6, [srcq+r6 *2+ 0], 0 ; 0 2096 movu ym16, [srcq+r6 *1+ 0] 2097 movu ym17, [srcq+r6 *1+12] 2098 vinserti32x8 
m16, [srcq+ssq*0+ 0], 1 2099 vinserti32x8 m17, [srcq+ssq*0+12], 1 ; 1 2 2100 movu ym18, [srcq+ssq*1+ 0] 2101 movu ym19, [srcq+ssq*1+12] 2102 lea srcq, [srcq+ssq*2] 2103 vinserti32x8 m18, [srcq+ssq*0+ 0], 1 2104 vinserti32x8 m19, [srcq+ssq*0+12], 1 ; 3 4 2105 pshufb m2, m20 2106 mova m1, m8 2107 pshufb m3, m16, m20 2108 vpdpwssd m1, m11, m2 ; a2 2109 mova m2, m8 2110 pshufb m4, m17, m21 2111 vpdpwssd m2, m9, m3 ; b0 c0 2112 mova m3, m8 2113 pshufb m5, m18, m20 2114 vpdpwssd m3, m11, m4 ; b2' c2' 2115 mova m4, m8 2116 pshufb m7, m19, m21 2117 vpdpwssd m4, m9, m5 ; d0 e0 2118 mova m5, m8 2119 pshufb m0, m6, m20 2120 vpdpwssd m5, m11, m7 ; d2' e2' 2121 mova m7, [spel_shuf16] 2122 pshufb m16, m21 2123 vpdpwssd m1, m9, m0 ; a0 2124 pshufb m17, m20 2125 vpdpwssd m2, m10, m16 ; b1 c1 2126 pshufb m18, m21 2127 vpdpwssd m3, m10, m17 ; b1' c1' 2128 pshufb m19, m20 2129 vpdpwssd m4, m10, m18 ; d1 e1 2130 pshufb m6, m21 2131 vpdpwssd m5, m10, m19 ; d1' e1' 2132 shufpd m16, m17, 0x55 2133 vpdpwssd m1, m10, m6 ; a1 2134 shufpd m18, m19, 0x55 2135 vpdpwssd m2, m11, m16 ; b2 c2 2136 vpdpwssd m3, m9, m16 ; b0' c0' 2137 vpdpwssd m4, m11, m18 ; d2 e2 2138 vpdpwssd m5, m9, m18 ; d0' e0' 2139 pslldq m1, 1 2140 vpermt2b m2, m7, m3 ; 12 2141 vpermt2b m4, m7, m5 ; 34 2142 vpshrdd m1, m2, 16 ; 01 2143 vpshrdd m3, m2, m4, 16 ; 23 2144.hv_w16_loop: 2145 movu ym18, [srcq+ssq*1+ 0] 2146 movu ym19, [srcq+ssq*1+12] 2147 lea srcq, [srcq+ssq*2] 2148 vinserti32x8 m18, [srcq+ssq*0+ 0], 1 2149 vinserti32x8 m19, [srcq+ssq*0+12], 1 2150 mova m5, m8 2151 mova m6, m8 2152 pshufb m17, m18, m20 2153 vpdpwssd m5, m9, m17 ; f0 g0 2154 pshufb m16, m19, m21 2155 vpdpwssd m6, m11, m16 ; f2' g2' 2156 pmaddwd m17, m12, m2 ; B0 2157 mova m2, m4 2158 pmaddwd m16, m12, m1 ; A0 2159 mova m1, m3 2160 pshufb m18, m21 2161 vpdpwssd m5, m10, m18 ; f1 g1 2162 pshufb m19, m20 2163 vpdpwssd m6, m10, m19 ; f1' g1' 2164 vpdpwssd m17, m13, m4 ; B1 2165 vpdpwssd m16, m13, m3 ; A1 2166 shufpd m18, m19, 0x55 2167 vpdpwssd m5, m11, 
m18 ; f2 g2 2168 vpdpwssd m6, m9, m18 ; f0' g0' 2169 mova m4, m7 2170 vpermi2b m4, m5, m6 ; 56 2171 vpshrdd m3, m2, m4, 16 ; 45 2172 vpdpwssd m17, m14, m4 ; B2 2173 vpdpwssd m16, m14, m3 ; A2 2174 psrad m16, 10 2175 psrad m17, 10 2176 vshufi32x4 m18, m16, m17, q3232 2177 vinserti32x8 m16, ym17, 1 2178 packusdw m16, m18 2179 pminsw m16, m15 2180 mova [dstq+dsq*0], ym16 2181 vextracti32x8 [dstq+dsq*1], m16, 1 2182 lea dstq, [dstq+dsq*2] 2183 sub hd, 2 2184 jg .hv_w16_loop 2185 vzeroupper 2186 RET 2187.hv_w32: 2188 WIN64_SPILL_XMM 28 2189 mova m27, [spel_shuf32] 2190 lea wd, [hq+wq*8-256] 2191.hv_w32_loop0: 2192 movu m16, [srcq+r6 *2+ 0] 2193 movu m7, [srcq+r6 *2+12] 2194 movu m6, [srcq+r6 *1+ 0] 2195 movu m18, [srcq+r6 *1+12] 2196 lea r7, [srcq+ssq*2] 2197 movu m17, [srcq+ssq*0+ 0] 2198 movu m19, [srcq+ssq*0+12] 2199 movu m22, [srcq+ssq*1+ 0] 2200 movu m24, [srcq+ssq*1+12] 2201 mov r8, dstq 2202 movu m23, [r7 +ssq*0+ 0] 2203 movu m25, [r7 +ssq*0+12] 2204 pshufb m1, m16, m20 2205 mova m0, m8 2206 pshufb m2, m7, m21 2207 vpdpwssd m0, m9, m1 ; a0 2208 mova m1, m8 2209 pshufb m4, m6, m20 2210 vpdpwssd m1, m11, m2 ; a2' 2211 mova m2, m8 2212 pshufb m3, m17, m20 2213 vpdpwssd m2, m9, m4 ; b0 2214 mova m4, m8 2215 pshufb m5, m18, m21 2216 vpdpwssd m4, m9, m3 ; c0 2217 mova m3, m8 2218 pshufb m26, m19, m21 2219 vpdpwssd m3, m11, m5 ; b2' 2220 mova m5, m8 2221 pshufb m16, m21 2222 vpdpwssd m5, m11, m26 ; c2' 2223 pshufb m7, m20 2224 vpdpwssd m0, m10, m16 ; a1 2225 pshufb m6, m21 2226 vpdpwssd m1, m10, m7 ; a1' 2227 pshufb m17, m21 2228 vpdpwssd m2, m10, m6 ; b1 2229 pshufb m18, m20 2230 vpdpwssd m4, m10, m17 ; c1 2231 pshufb m19, m20 2232 vpdpwssd m3, m10, m18 ; b1' 2233 shufpd m16, m7, 0x55 2234 vpdpwssd m5, m10, m19 ; c1' 2235 shufpd m6, m18, 0x55 2236 vpdpwssd m0, m11, m16 ; a2 2237 shufpd m17, m19, 0x55 2238 vpdpwssd m1, m9, m16 ; a0' 2239 pshufb m16, m22, m20 2240 vpdpwssd m2, m11, m6 ; b2 2241 pshufb m7, m23, m20 2242 vpdpwssd m4, m11, m17 ; c2 2243 vpdpwssd m3, m9, m6 
; b0' 2244 mova m6, m8 2245 vpdpwssd m5, m9, m17 ; c0' 2246 pshufb m17, m24, m21 2247 vpdpwssd m6, m9, m16 ; d0 2248 mova m16, m8 2249 pshufb m26, m25, m21 2250 vpdpwssd m16, m9, m7 ; e0 2251 mova m7, m8 2252 pshufb m22, m21 2253 vpdpwssd m7, m11, m17 ; d2' 2254 mova m17, m8 2255 pshufb m23, m21 2256 vpdpwssd m17, m11, m26 ; e2' 2257 pshufb m24, m20 2258 vpdpwssd m6, m10, m22 ; d1 2259 pshufb m25, m20 2260 vpdpwssd m16, m10, m23 ; e1 2261 shufpd m22, m24, 0x55 2262 vpdpwssd m7, m10, m24 ; d1' 2263 shufpd m23, m25, 0x55 2264 vpdpwssd m17, m10, m25 ; e1' 2265 pslldq m0, 1 2266 vpdpwssd m6, m11, m22 ; d2 2267 pslldq m1, 1 2268 vpdpwssd m16, m11, m23 ; e2 2269 vpermt2b m2, m27, m4 ; 12 2270 vpdpwssd m7, m9, m22 ; d0' 2271 vpermt2b m3, m27, m5 ; 12' 2272 vpdpwssd m17, m9, m23 ; e0' 2273 vpshrdd m0, m2, 16 ; 01 2274 vpermt2b m6, m27, m16 ; 34 2275 vpshrdd m1, m3, 16 ; 01' 2276 vpermt2b m7, m27, m17 ; 34' 2277 vpshrdd m4, m2, m6, 16 ; 23 2278 vpshrdd m5, m3, m7, 16 ; 23' 2279.hv_w32_loop: 2280 movu m22, [r7+ssq*1+ 0] 2281 movu m24, [r7+ssq*1+12] 2282 lea r7, [r7+ssq*2] 2283 movu m23, [r7+ssq*0+ 0] 2284 movu m25, [r7+ssq*0+12] 2285 pmaddwd m17, m12, m2 ; B0 2286 mova m2, m6 2287 pmaddwd m19, m12, m3 ; B0' 2288 mova m3, m7 2289 pmaddwd m16, m12, m0 ; A0 2290 mova m0, m4 2291 pmaddwd m18, m12, m1 ; A0' 2292 mova m1, m5 2293 vpdpwssd m17, m13, m6 ; B1 2294 vpdpwssd m19, m13, m7 ; B1' 2295 mova m6, m8 2296 vpdpwssd m16, m13, m4 ; A1 2297 pshufb m4, m22, m20 2298 vpdpwssd m18, m13, m5 ; A1' 2299 pshufb m7, m23, m20 2300 vpdpwssd m6, m9, m4 ; f0 2301 mova m4, m8 2302 pshufb m5, m24, m21 2303 vpdpwssd m4, m9, m7 ; g0 2304 mova m7, m8 2305 pshufb m26, m25, m21 2306 vpdpwssd m7, m11, m5 ; f2' 2307 mova m5, m8 2308 pshufb m22, m21 2309 vpdpwssd m5, m11, m26 ; g2' 2310 pshufb m23, m21 2311 vpdpwssd m6, m10, m22 ; f1 2312 pshufb m24, m20 2313 vpdpwssd m4, m10, m23 ; g1 2314 pshufb m25, m20 2315 vpdpwssd m7, m10, m24 ; f1' 2316 shufpd m22, m24, 0x55 2317 vpdpwssd m5, m10, m25 ; g1' 
2318 shufpd m23, m25, 0x55 2319 vpdpwssd m6, m11, m22 ; f2 2320 vpdpwssd m4, m11, m23 ; g2 2321 vpdpwssd m7, m9, m22 ; f0' 2322 vpdpwssd m5, m9, m23 ; g0' 2323 vpermt2b m6, m27, m4 ; 56 2324 vpermt2b m7, m27, m5 ; 56' 2325 vpdpwssd m17, m14, m6 ; B2 2326 vpshrdd m4, m2, m6, 16 ; 45 2327 vpdpwssd m19, m14, m7 ; B2' 2328 vpshrdd m5, m3, m7, 16 ; 45' 2329 vpdpwssd m16, m14, m4 ; A2 2330 vpdpwssd m18, m14, m5 ; A2' 2331 REPX {psrad x, 10}, m17, m19, m16, m18 2332 packusdw m17, m19 2333 packusdw m16, m18 2334 pminsw m17, m15 2335 pminsw m16, m15 2336 mova [r8+dsq*0], m16 2337 mova [r8+dsq*1], m17 2338 lea r8, [r8+dsq*2] 2339 sub hd, 2 2340 jg .hv_w32_loop 2341 add srcq, 64 2342 add dstq, 64 2343 movzx hd, wb 2344 sub wd, 1<<8 2345 jg .hv_w32_loop0 2346 RET 2347 2348PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP, put_8tap_16bpc 2349PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH, put_8tap_16bpc 2350PUT_8TAP_FN regular_sharp, REGULAR, SHARP, put_8tap_16bpc 2351PUT_8TAP_FN sharp_regular, SHARP, REGULAR, put_8tap_16bpc 2352PUT_8TAP_FN sharp, SHARP, SHARP 2353 2354cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my 2355 imul mxd, mxm, 0x010101 2356 add mxd, t0d ; 8tap_h, mx, 4tap_h 2357 imul myd, mym, 0x010101 2358 add myd, t1d ; 8tap_v, my, 4tap_v 2359 lea r8, [put_avx512icl] 2360 movifnidn wd, wm 2361 movifnidn hd, hm 2362 test mxd, 0xf00 2363 jnz .h 2364 test myd, 0xf00 2365 jz mangle(private_prefix %+ _put_6tap_16bpc_avx512icl).put 2366.v: 2367 movzx mxd, myb 2368 shr myd, 16 2369 cmp hd, 6 2370 cmovs myd, mxd 2371 vpbroadcastd m10, [pd_32] 2372 pmovsxbw xmm0, [base+subpel_filters+myq*8] 2373 tzcnt r7d, wd 2374 vpbroadcastw m11, r8m 2375 lea r6, [ssq*3] 2376 movzx r7d, word [r8+r7*2+table_offset(put, _8tap_v)] 2377 sub srcq, r6 2378 mova [rsp+stack_offset+8], xmm0 2379 vpbroadcastd m12, xmm0 2380 add r7, r8 2381 vpbroadcastd m13, [rsp+stack_offset+12] 2382 vpbroadcastd m14, [rsp+stack_offset+16] 2383 vpbroadcastd m15, [rsp+stack_offset+20] 2384 jmp r7 2385.v_w2: 2386 movd 
xmm2, [srcq+ssq*0] 2387 pinsrd xmm2, [srcq+ssq*1], 1 2388 pinsrd xmm2, [srcq+ssq*2], 2 2389 add srcq, r6 2390 pinsrd xmm2, [srcq+ssq*0], 3 ; 0 1 2 3 2391 movd xmm3, [srcq+ssq*1] 2392 vpbroadcastd xmm1, [srcq+ssq*2] 2393 add srcq, r6 2394 vpbroadcastd xmm0, [srcq+ssq*0] 2395 vpblendd xmm3, xmm1, 0x02 ; 4 5 2396 vpblendd xmm1, xmm0, 0x02 ; 5 6 2397 palignr xmm4, xmm3, xmm2, 4 ; 1 2 3 4 2398 punpcklwd xmm3, xmm1 ; 45 56 2399 punpcklwd xmm1, xmm2, xmm4 ; 01 12 2400 punpckhwd xmm2, xmm4 ; 23 34 2401.v_w2_loop: 2402 vpbroadcastd xmm4, [srcq+ssq*1] 2403 lea srcq, [srcq+ssq*2] 2404 mova xmm5, xm10 2405 vpdpwssd xmm5, xm12, xmm1 ; a0 b0 2406 mova xmm1, xmm2 2407 vpdpwssd xmm5, xm13, xmm2 ; a1 b1 2408 mova xmm2, xmm3 2409 vpdpwssd xmm5, xm14, xmm3 ; a2 b2 2410 vpblendd xmm3, xmm0, xmm4, 0x02 ; 6 7 2411 vpbroadcastd xmm0, [srcq+ssq*0] 2412 vpblendd xmm4, xmm0, 0x02 ; 7 8 2413 punpcklwd xmm3, xmm4 ; 67 78 2414 vpdpwssd xmm5, xm15, xmm3 ; a3 b3 2415 psrad xmm5, 6 2416 packusdw xmm5, xmm5 2417 pminsw xmm5, xm11 2418 movd [dstq+dsq*0], xmm5 2419 pextrd [dstq+dsq*1], xmm5, 1 2420 lea dstq, [dstq+dsq*2] 2421 sub hd, 2 2422 jg .v_w2_loop 2423 RET 2424.v_w4: 2425 movq xmm1, [srcq+ssq*0] 2426 vpbroadcastq ymm0, [srcq+ssq*1] 2427 vpbroadcastq ymm2, [srcq+ssq*2] 2428 add srcq, r6 2429 vpbroadcastq ymm4, [srcq+ssq*0] 2430 vpbroadcastq ymm3, [srcq+ssq*1] 2431 vpbroadcastq ymm5, [srcq+ssq*2] 2432 add srcq, r6 2433 vpblendd ymm1, ymm0, 0x30 2434 vpblendd ymm0, ymm2, 0x30 2435 punpcklwd ymm1, ymm0 ; 01 12 2436 vpbroadcastq ymm0, [srcq+ssq*0] 2437 vpblendd ymm2, ymm4, 0x30 2438 vpblendd ymm4, ymm3, 0x30 2439 punpcklwd ymm2, ymm4 ; 23 34 2440 vpblendd ymm3, ymm5, 0x30 2441 vpblendd ymm5, ymm0, 0x30 2442 punpcklwd ymm3, ymm5 ; 45 56 2443.v_w4_loop: 2444 vpbroadcastq ymm5, [srcq+ssq*1] 2445 lea srcq, [srcq+ssq*2] 2446 mova ymm4, ym10 2447 vpdpwssd ymm4, ym12, ymm1 ; a0 b0 2448 mova ymm1, ymm2 2449 vpdpwssd ymm4, ym13, ymm2 ; a1 b1 2450 mova ymm2, ymm3 2451 vpdpwssd ymm4, ym14, ymm3 ; a2 b2 2452 
vpblendd ymm3, ymm0, ymm5, 0x30 2453 vpbroadcastq ymm0, [srcq+ssq*0] 2454 vpblendd ymm5, ymm0, 0x30 2455 punpcklwd ymm3, ymm5 ; 67 78 2456 vpdpwssd ymm4, ym15, ymm3 ; a3 b3 2457 psrad ymm4, 6 2458 vextracti128 xmm5, ymm4, 1 2459 packusdw xmm4, xmm5 2460 pminsw xmm4, xm11 2461 movq [dstq+dsq*0], xmm4 2462 movhps [dstq+dsq*1], xmm4 2463 lea dstq, [dstq+dsq*2] 2464 sub hd, 2 2465 jg .v_w4_loop 2466 vzeroupper 2467 RET 2468.v_w8: 2469 vbroadcasti32x4 m2, [srcq+ssq*2] 2470 vinserti32x4 m1, m2, [srcq+ssq*0], 0 2471 vinserti32x4 m1, [srcq+ssq*1], 1 ; 0 1 2 2472 add srcq, r6 2473 vinserti32x4 ym2, [srcq+ssq*0], 1 2474 vinserti32x4 m2, [srcq+ssq*1], 2 ; 2 3 4 2475 mova m6, [spel_v_shuf8] 2476 movu xm0, [srcq+ssq*1] 2477 vinserti32x4 ym0, [srcq+ssq*2], 1 2478 add srcq, r6 2479 vinserti32x4 m0, [srcq+ssq*0], 2 ; 4 5 6 2480 vpermb m1, m6, m1 ; 01 12 2481 vpermb m2, m6, m2 ; 23 34 2482 vpermb m3, m6, m0 ; 45 56 2483.v_w8_loop: 2484 vinserti32x4 m0, [srcq+ssq*1], 3 2485 lea srcq, [srcq+ssq*2] 2486 movu xm5, [srcq+ssq*0] 2487 mova m4, m10 2488 vpdpwssd m4, m12, m1 ; a0 b0 2489 mova m1, m2 2490 vshufi32x4 m0, m5, q1032 ; 6 7 8 2491 vpdpwssd m4, m13, m2 ; a1 b1 2492 mova m2, m3 2493 vpdpwssd m4, m14, m3 ; a2 b2 2494 vpermb m3, m6, m0 ; 67 78 2495 vpdpwssd m4, m15, m3 ; a3 b3 2496 psrad m4, 6 2497 vextracti32x8 ym5, m4, 1 2498 packusdw ym4, ym5 2499 pminsw ym4, ym11 2500 mova [dstq+dsq*0], xm4 2501 vextracti32x4 [dstq+dsq*1], ym4, 1 2502 lea dstq, [dstq+dsq*2] 2503 sub hd, 2 2504 jg .v_w8_loop 2505 RET 2506.v_w16: 2507 vbroadcasti32x8 m0, [srcq+ssq*1] 2508 vinserti32x8 m1, m0, [srcq+ssq*2], 1 2509 vinserti32x8 m0, [srcq+ssq*0], 0 2510 mova m8, [spel_v_shuf16] 2511 add srcq, r6 2512 movu ym3, [srcq+ssq*0] 2513 vinserti32x8 m3, [srcq+ssq*1], 1 2514 movu ym5, [srcq+ssq*2] 2515 add srcq, r6 2516 vinserti32x8 m5, [srcq+ssq*0], 1 2517 vpermb m1, m8, m1 ; 12 2518 vpermb m0, m8, m0 ; 01 2519 vpermb m3, m8, m3 ; 34 2520 vpermb m5, m8, m5 ; 56 2521 mova m9, [deint_q_shuf] 2522 vpshrdd m2, m1, 
m3, 16 ; 23 2523 vpshrdd m4, m3, m5, 16 ; 45 2524.v_w16_loop: 2525 mova m7, m10 2526 vpdpwssd m7, m12, m1 ; b0 2527 mova m6, m10 2528 vpdpwssd m6, m12, m0 ; a0 2529 mova m1, m3 2530 vpdpwssd m7, m13, m3 ; b1 2531 mova m0, m2 2532 vpdpwssd m6, m13, m2 ; a1 2533 mova m3, m5 2534 vpdpwssd m7, m14, m5 ; b2 2535 mova m2, m4 2536 vpdpwssd m6, m14, m4 ; a2 2537 movu ym5, [srcq+ssq*1] 2538 lea srcq, [srcq+ssq*2] 2539 vinserti32x8 m5, [srcq+ssq*0], 1 2540 vpermb m5, m8, m5 ; 78 2541 vpshrdd m4, m3, m5, 16 ; 67 2542 vpdpwssd m7, m15, m5 ; b3 2543 vpdpwssd m6, m15, m4 ; a3 2544 psrad m7, 6 2545 psrad m6, 6 2546 packusdw m6, m7 2547 pminsw m6, m11 2548 vpermq m6, m9, m6 2549 mova [dstq+dsq*0], ym6 2550 vextracti32x8 [dstq+dsq*1], m6, 1 2551 lea dstq, [dstq+dsq*2] 2552 sub hd, 2 2553 jg .v_w16_loop 2554 RET 2555.v_w32: 2556.v_w64: 2557.v_w128: 2558 WIN64_SPILL_XMM 23 2559 lea wd, [hq+wq*8-256] 2560.v_w32_loop0: 2561 movu m16, [srcq+ssq*0] 2562 movu m17, [srcq+ssq*1] 2563 lea r7, [srcq+r6 ] 2564 movu m18, [srcq+ssq*2] 2565 movu m19, [r7 +ssq*0] 2566 mov r8, dstq 2567 movu m20, [r7 +ssq*1] 2568 movu m21, [r7 +ssq*2] 2569 add r7, r6 2570 movu m22, [r7 +ssq*0] 2571 punpcklwd m0, m16, m17 ; 01l 2572 punpckhwd m16, m17 ; 01h 2573 punpcklwd m1, m17, m18 ; 12l 2574 punpckhwd m17, m18 ; 12h 2575 punpcklwd m2, m18, m19 ; 23l 2576 punpckhwd m18, m19 ; 23h 2577 punpcklwd m3, m19, m20 ; 34l 2578 punpckhwd m19, m20 ; 34h 2579 punpcklwd m4, m20, m21 ; 45l 2580 punpckhwd m20, m21 ; 45h 2581 punpcklwd m5, m21, m22 ; 56l 2582 punpckhwd m21, m22 ; 56h 2583.v_w32_loop: 2584 mova m6, m10 2585 vpdpwssd m6, m12, m0 ; a0l 2586 mova m8, m10 2587 vpdpwssd m8, m12, m16 ; a0h 2588 mova m7, m10 2589 vpdpwssd m7, m12, m1 ; b0l 2590 mova m9, m10 2591 vpdpwssd m9, m12, m17 ; b0h 2592 mova m0, m2 2593 vpdpwssd m6, m13, m2 ; a1l 2594 mova m16, m18 2595 vpdpwssd m8, m13, m18 ; a1h 2596 mova m1, m3 2597 vpdpwssd m7, m13, m3 ; b1l 2598 mova m17, m19 2599 vpdpwssd m9, m13, m19 ; b1h 2600 mova m2, m4 2601 vpdpwssd 
m6, m14, m4 ; a2l 2602 mova m18, m20 2603 vpdpwssd m8, m14, m20 ; a2h 2604 mova m3, m5 2605 vpdpwssd m7, m14, m5 ; b2l 2606 mova m19, m21 2607 vpdpwssd m9, m14, m21 ; b2h 2608 movu m21, [r7+ssq*1] 2609 lea r7, [r7+ssq*2] 2610 punpcklwd m4, m22, m21 ; 67l 2611 punpckhwd m20, m22, m21 ; 67h 2612 movu m22, [r7+ssq*0] 2613 vpdpwssd m6, m15, m4 ; a3l 2614 vpdpwssd m8, m15, m20 ; a3h 2615 punpcklwd m5, m21, m22 ; 78l 2616 punpckhwd m21, m22 ; 78h 2617 vpdpwssd m7, m15, m5 ; b3l 2618 vpdpwssd m9, m15, m21 ; b3h 2619 REPX {psrad x, 6}, m6, m8, m7, m9 2620 packusdw m6, m8 2621 packusdw m7, m9 2622 pminsw m6, m11 2623 pminsw m7, m11 2624 mova [r8+dsq*0], m6 2625 mova [r8+dsq*1], m7 2626 lea r8, [r8+dsq*2] 2627 sub hd, 2 2628 jg .v_w32_loop 2629 add srcq, 64 2630 add dstq, 64 2631 movzx hd, wb 2632 sub wd, 1<<8 2633 jg .v_w32_loop0 2634 RET 2635.h_w2: 2636 RESET_STACK_STATE 2637 mova ym2, [spel_h_shuf2a] 2638 sub srcq, 2 2639 pshufd xmm3, xmm0, q1111 2640 pshufd xmm4, xmm0, q2222 2641.h_w2_loop: 2642 movu xm1, [srcq+ssq*0] 2643 vinserti32x4 ym1, [srcq+ssq*1], 1 2644 lea srcq, [srcq+ssq*2] 2645 mova xmm0, xm8 2646 vpermb ym1, ym2, ym1 2647 vpdpwssd xmm0, xmm3, xm1 2648 vextracti32x4 xm1, ym1, 1 2649 vpdpwssd xmm0, xmm4, xm1 2650 psrad xmm0, 6 2651 packusdw xmm0, xmm0 2652 pminsw xmm0, xm15 2653 movd [dstq+dsq*0], xmm0 2654 pextrd [dstq+dsq*1], xmm0, 1 2655 lea dstq, [dstq+dsq*2] 2656 sub hd, 2 2657 jg .h_w2_loop 2658 RET 2659.h_w4: 2660 movzx mxd, mxb 2661 pmovsxbw xmm0, [base+subpel_filters+mxq*8] 2662 jl .h_w2 2663 vbroadcasti32x4 ym4, [spel_h_shufA] 2664 vbroadcasti32x4 ym5, [spel_h_shufB] 2665 sub srcq, 2 2666 pshufd xmm0, xmm0, q2211 2667 vpbroadcastq ym6, xmm0 2668 vpermq ym7, ymm0, q1111 2669.h_w4_loop: 2670 movu xm2, [srcq+ssq*0] 2671 vinserti32x4 ym2, [srcq+ssq*1], 1 2672 lea srcq, [srcq+ssq*2] 2673 mova ym0, ym8 2674 pshufb ym1, ym2, ym4 2675 vpdpwssd ym0, ym6, ym1 2676 pshufb ym2, ym5 2677 vpdpwssd ym0, ym7, ym2 2678 psrad ym0, 6 2679 vextracti32x4 xm1, ym0, 1 2680 
packusdw xm0, xm1 2681 pminsw xmm0, xm0, xm15 2682 movq [dstq+dsq*0], xmm0 2683 movhps [dstq+dsq*1], xmm0 2684 lea dstq, [dstq+dsq*2] 2685 sub hd, 2 2686 jg .h_w4_loop 2687 RET 2688.h_w8: 2689 mova m4, [spel_h_shufA] 2690 movu m5, [spel_h_shufB] 2691 movu m6, [spel_h_shufC] 2692 mova m7, [spel_h_shufD] 2693.h_w8_loop: 2694 movu ym2, [srcq+ssq*0] 2695 vinserti32x8 m2, [srcq+ssq*1], 1 2696 lea srcq, [srcq+ssq*2] 2697 mova m0, m8 2698 vpermb m1, m4, m2 2699 vpdpwssd m0, m10, m1 2700 vpermb m1, m5, m2 2701 vpdpwssd m0, m11, m1 2702 vpermb m1, m6, m2 2703 vpdpwssd m0, m12, m1 2704 vpermb m1, m7, m2 2705 vpdpwssd m0, m13, m1 2706 psrad m0, 6 2707 vextracti32x8 ym1, m0, 1 2708 packusdw ym0, ym1 2709 pminsw ym0, ym15 2710 mova [dstq+dsq*0], xm0 2711 vextracti32x4 [dstq+dsq*1], ym0, 1 2712 lea dstq, [dstq+dsq*2] 2713 sub hd, 2 2714 jg .h_w8_loop 2715 RET 2716.h: 2717 vpbroadcastw m15, r8m 2718 test myd, 0xf00 2719 jnz .hv 2720 mov r7d, r8m 2721 shr r7d, 11 2722 vpbroadcastd m8, [base+put_8tap_h_rnd+r7*4] 2723 cmp wd, 4 2724 jle .h_w4 2725 shr mxd, 16 2726 sub srcq, 6 2727 pmovsxbw xmm0, [base+subpel_filters+mxq*8] 2728 mova [buf], xmm0 2729 vpbroadcastd m10, xmm0 2730 vpbroadcastd m11, [buf+ 4] 2731 vpbroadcastd m12, [buf+ 8] 2732 vpbroadcastd m13, [buf+12] 2733 sub wd, 16 2734 jl .h_w8 2735 vbroadcasti32x4 m6, [spel_h_shufA] 2736 vbroadcasti32x4 m7, [spel_h_shufB] 2737 jg .h_w32 2738.h_w16_loop: 2739 movu ym2, [srcq+ssq*0+ 0] 2740 vinserti32x8 m2, [srcq+ssq*1+ 0], 1 2741 movu ym3, [srcq+ssq*0+16] 2742 vinserti32x8 m3, [srcq+ssq*1+16], 1 2743 lea srcq, [srcq+ssq*2] 2744 mova m0, m8 2745 mova m1, m8 2746 pshufb m4, m2, m6 2747 vpdpwssd m0, m10, m4 ; a0 2748 pshufb m4, m3, m6 2749 vpdpwssd m1, m12, m4 ; b2 2750 pshufb m4, m2, m7 2751 vpdpwssd m0, m11, m4 ; a1 2752 pshufb m4, m3, m7 2753 vpdpwssd m1, m13, m4 ; b3 2754 shufpd m2, m3, 0x55 2755 pshufb m4, m2, m6 2756 vpdpwssd m0, m12, m4 ; a2 2757 vpdpwssd m1, m10, m4 ; b0 2758 pshufb m2, m7 2759 vpdpwssd m0, m13, m2 ; a3 2760 
vpdpwssd m1, m11, m2 ; b1 2761 psrad m0, 6 2762 psrad m1, 6 2763 packusdw m0, m1 2764 pminsw m0, m15 2765 mova [dstq+dsq*0], ym0 2766 vextracti32x8 [dstq+dsq*1], m0, 1 2767 lea dstq, [dstq+dsq*2] 2768 sub hd, 2 2769 jg .h_w16_loop 2770 RET 2771.h_w32: 2772 lea srcq, [srcq+wq*2] 2773 lea dstq, [dstq+wq*2] 2774 neg wq 2775.h_w32_loop0: 2776 mov r6, wq 2777.h_w32_loop: 2778 movu m2, [srcq+r6*2+ 0] 2779 movu m3, [srcq+r6*2+ 8] 2780 mova m0, m8 2781 mova m1, m8 2782 pshufb m4, m2, m6 2783 vpdpwssd m0, m10, m4 ; a0 2784 pshufb m4, m3, m6 2785 vpdpwssd m1, m10, m4 ; b0 2786 vpdpwssd m0, m12, m4 ; a2 2787 movu m4, [srcq+r6*2+16] 2788 pshufb m3, m7 2789 vpdpwssd m1, m11, m3 ; b1 2790 vpdpwssd m0, m13, m3 ; a3 2791 pshufb m3, m4, m6 2792 vpdpwssd m1, m12, m3 ; b2 2793 pshufb m2, m7 2794 vpdpwssd m0, m11, m2 ; a1 2795 pshufb m4, m7 2796 vpdpwssd m1, m13, m4 ; b3 2797 psrad m0, 6 2798 psrad m1, 6 2799 packusdw m0, m1 2800 pminsw m0, m15 2801 mova [dstq+r6*2], m0 2802 add r6, 32 2803 jl .h_w32_loop 2804 add srcq, ssq 2805 add dstq, dsq 2806 dec hd 2807 jg .h_w32_loop0 2808 RET 2809.hv: 2810 cmp wd, 4 2811 jg .hv_w8 2812 movzx mxd, mxb 2813 pmovsxbw xmm0, [base+subpel_filters+mxq*8] 2814 movzx mxd, myb 2815 shr myd, 16 2816 cmp hd, 6 2817 cmovs myd, mxd 2818 pmovsxbw xmm1, [base+subpel_filters+myq*8] 2819 lea r6, [ssq*3] 2820 sub srcq, 2 2821 sub srcq, r6 2822 test dword r8m, 0x800 2823 jnz .hv_12bit 2824 vpbroadcastd m10, [pd_2176] 2825 psllw xmm0, 6 2826 jmp .hv_main 2827.hv_12bit: 2828 vpbroadcastd m10, [pd_640] 2829 psllw xmm0, 4 2830 psllw xmm1, 2 2831.hv_main: 2832 mova [buf+ 0], xmm0 2833 mova [buf+16], xmm1 2834 vpbroadcastd m8, [buf+ 4] 2835 vpbroadcastd m9, [buf+ 8] 2836 vpbroadcastd ym11, xmm1 2837 vpbroadcastd ym12, [buf+20] 2838 vpbroadcastd ym13, [buf+24] 2839 vpbroadcastd ym14, [buf+28] 2840 movu xm4, [srcq+ssq*0] 2841 vinserti32x4 ym4, [srcq+ssq*1], 1 2842 vinserti32x4 m4, [srcq+ssq*2], 2 2843 add srcq, r6 2844 vinserti32x4 m4, [srcq+ssq*0], 3 ; 0 1 2 3 2845 movu 
xm0, [srcq+ssq*1] 2846 vinserti32x4 ym0, [srcq+ssq*2], 1 2847 add srcq, r6 2848 vinserti32x4 m0, [srcq+ssq*0], 2 ; 4 5 6 2849 cmp wd, 4 2850 je .hv_w4 2851 vbroadcasti32x4 m2, [spel_h_shufA] 2852 mova m3, [spel_h_shuf2b] 2853 mova ym6, [spel_h_shuf2a] 2854 mova xm7, [spel_shuf2] 2855 mova m1, m10 2856 pshufb m4, m2 2857 pshufb m0, m2 2858 punpcklqdq m2, m4, m0 2859 vpdpwssd m1, m8, m2 ; 04 15 26 3_ 2860 punpckhqdq m4, m0 2861 vpdpwssd m1, m9, m4 2862 vpermb m1, m3, m1 ; 01 12 2863 vextracti32x4 xm2, ym1, 1 ; 23 34 2864 vextracti32x4 xm3, m1, 2 ; 45 56 2865.hv_w2_loop: 2866 movu xm5, [srcq+ssq*1] 2867 lea srcq, [srcq+ssq*2] 2868 vinserti32x4 ym5, [srcq+ssq*0], 1 2869 mova xm4, xm10 2870 vpermb ym5, ym6, ym5 2871 pmaddwd xmm0, xm11, xm1 ; a0 b0 2872 vpdpwssd xm4, xm8, xm5 2873 vextracti32x4 xm5, ym5, 1 2874 mova xm1, xm2 2875 vpdpwssd xmm0, xm12, xm2 ; a1 b1 2876 vpdpwssd xm4, xm9, xm5 ; 7 8 2877 mova xm2, xm3 2878 vpdpwssd xmm0, xm13, xm3 ; a2 b2 2879 vpermt2b xm3, xm7, xm4 ; 67 78 2880 vpdpwssd xmm0, xm14, xm3 ; a3 b3 2881 psrad xmm0, 10 2882 packusdw xmm0, xmm0 2883 pminsw xmm0, xm15 2884 movd [dstq+dsq*0], xmm0 2885 pextrd [dstq+dsq*1], xmm0, 1 2886 lea dstq, [dstq+dsq*2] 2887 sub hd, 2 2888 jg .hv_w2_loop 2889 RET 2890.hv_w4: 2891 vbroadcasti32x4 m19, [spel_h_shufA] 2892 vbroadcasti32x4 m20, [spel_h_shufB] 2893 mova ym6, [spel_shuf4a] 2894 mova ym7, [spel_shuf4b] 2895 mova m2, m10 2896 mova m3, m10 2897 pshufb m1, m4, m19 2898 vpdpwssd m2, m8, m1 2899 pshufb m1, m0, m19 2900 vpdpwssd m3, m8, m1 2901 pshufb m4, m20 2902 vpdpwssd m2, m9, m4 2903 pshufb m0, m20 2904 vpdpwssd m3, m9, m0 2905 vpermb m1, m6, m2 ; 01 12 2906 vshufi32x4 m2, m3, q1032 2907 vpermb m3, m6, m3 ; 45 56 2908 vpermb m2, m6, m2 ; 23 34 2909.hv_w4_loop: 2910 movu xm18, [srcq+ssq*1] 2911 lea srcq, [srcq+ssq*2] 2912 vinserti128 ym18, [srcq+ssq*0], 1 2913 pmaddwd ym16, ym11, ym1 ; a0 b0 2914 mova ym1, ym2 2915 mova ym2, ym3 2916 pshufb ym17, ym18, ym19 2917 mova ym3, ym10 2918 vpdpwssd ym3, ym8, 
ym17 2919 pshufb ym18, ym20 2920 vpdpwssd ym16, ym12, ym1 ; a1 b1 2921 vpdpwssd ym3, ym9, ym18 ; 7 8 2922 vpdpwssd ym16, ym13, ym2 ; a2 b2 2923 vpermt2b ym3, ym7, ym2 ; 67 78 2924 vpdpwssd ym16, ym14, ym3 ; a3 b3 2925 psrad ym16, 10 2926 vextracti128 xm17, ym16, 1 2927 packusdw xm16, xm17 2928 pminsw xm16, xm15 2929 movq [dstq+dsq*0], xm16 2930 movhps [dstq+dsq*1], xm16 2931 lea dstq, [dstq+dsq*2] 2932 sub hd, 2 2933 jg .hv_w4_loop 2934 vzeroupper 2935 RET 2936.hv_w8: 2937 shr mxd, 16 2938 pmovsxbw xmm0, [base+subpel_filters+mxq*8] 2939 movzx mxd, myb 2940 shr myd, 16 2941 cmp hd, 6 2942 cmovs myd, mxd 2943 pmovsxbw xmm1, [base+subpel_filters+myq*8] 2944 lea r6, [ssq*3] 2945 sub srcq, 6 2946 sub srcq, r6 2947 test dword r8m, 0x800 2948 jnz .hv_w8_12bit 2949 vpbroadcastd m10, [pd_2176] 2950 psllw xmm0, 6 2951 jmp .hv_w8_main 2952.hv_w8_12bit: 2953 vpbroadcastd m10, [pd_640] 2954 psllw xmm0, 4 2955 psllw xmm1, 2 2956.hv_w8_main: 2957 mova [buf+ 0], xmm0 2958 mova [buf+16], xmm1 2959 vpbroadcastd m11, xmm0 2960 vpbroadcastd m12, [buf+ 4] 2961 vpbroadcastd m13, [buf+ 8] 2962 vpbroadcastd m14, [buf+12] 2963 vpbroadcastd m16, xmm1 2964 vpbroadcastd m17, [buf+20] 2965 vpbroadcastd m18, [buf+24] 2966 vpbroadcastd m19, [buf+28] 2967 cmp wd, 8 2968 jg .hv_w16 2969 mova m5, [spel_h_shufA] 2970 movu ym0, [srcq+ssq*0] 2971 vinserti32x8 m0, [srcq+ssq*1], 1 ; 0 1 2972 movu ym9, [srcq+ssq*2] 2973 add srcq, r6 2974 vinserti32x8 m9, [srcq+ssq*0], 1 ; 2 3 2975 movu ym20, [srcq+ssq*1] 2976 vinserti32x8 m20, [srcq+ssq*2], 1 ; 4 5 2977 add srcq, r6 2978 movu ym21, [srcq+ssq*0] ; 6 2979 movu m6, [spel_h_shufB] 2980 movu m7, [spel_h_shufC] 2981 vpermb m8, m5, m0 2982 mova m1, m10 2983 vpdpwssd m1, m11, m8 ; a0 b0 2984 vpermb m8, m5, m9 2985 mova m2, m10 2986 vpdpwssd m2, m11, m8 ; c0 d0 2987 vpermb m8, m5, m20 2988 mova m3, m10 2989 vpdpwssd m3, m11, m8 ; e0 f0 2990 vpermb m8, m5, m21 2991 mova m4, m10 2992 vpdpwssd m4, m11, m8 ; g0 2993 vpermb m8, m6, m0 2994 vpdpwssd m1, m12, m8 ; a1 b1 
2995 vpermb m8, m6, m9 2996 vpdpwssd m2, m12, m8 ; c1 d1 2997 vpermb m8, m6, m20 2998 vpdpwssd m3, m12, m8 ; e1 f1 2999 vpermb m8, m6, m21 3000 vpdpwssd m4, m12, m8 ; g1 3001 vpermb m8, m7, m0 3002 vpdpwssd m1, m13, m8 ; a2 b2 3003 vpermb m8, m7, m9 3004 vpdpwssd m2, m13, m8 ; c2 d2 3005 vpermb m8, m7, m20 3006 vpdpwssd m3, m13, m8 ; e2 f2 3007 vpermb m8, m7, m21 3008 vpdpwssd m4, m13, m8 ; g2 3009 mova m8, [spel_h_shufD] 3010 vpermb m0, m8, m0 3011 vpdpwssd m1, m14, m0 ; a3 b3 3012 mova m0, [spel_shuf8a] 3013 vpermb m9, m8, m9 3014 vpdpwssd m2, m14, m9 ; c3 d3 3015 mova m9, [spel_shuf8b] 3016 vpermb m20, m8, m20 3017 vpdpwssd m3, m14, m20 ; e3 f3 3018 vpermb m21, m8, m21 3019 vpdpwssd m4, m14, m21 ; g3 3020 vpermt2b m1, m0, m2 ; 01 12 3021 vpermt2b m2, m0, m3 ; 23 34 3022 vpermt2b m3, m0, m4 ; 45 56 3023.hv_w8_loop: 3024 movu ym0, [srcq+ssq*1] 3025 lea srcq, [srcq+ssq*2] 3026 vinserti32x8 m0, [srcq+ssq*0], 1 3027 mova m4, m10 3028 vpermb m21, m5, m0 3029 vpdpwssd m4, m11, m21 ; h0 i0 3030 vpermb m21, m6, m0 3031 pmaddwd m20, m16, m1 ; A0 B0 3032 vpdpwssd m4, m12, m21 ; h1 i1 3033 vpermb m21, m7, m0 3034 mova m1, m2 3035 vpdpwssd m20, m17, m2 ; A1 B1 3036 vpdpwssd m4, m13, m21 ; h2 i2 3037 vpermb m21, m8, m0 3038 mova m2, m3 3039 vpdpwssd m20, m18, m3 ; A2 B2 3040 vpdpwssd m4, m14, m21 ; h3 i3 3041 vpermt2b m3, m9, m4 ; 67 78 3042 vpdpwssd m20, m19, m3 ; A3 B3 3043 psrad m20, 10 3044 vextracti32x8 ym21, m20, 1 3045 packusdw ym20, ym21 3046 pminsw ym20, ym15 3047 mova [dstq+dsq*0], xm20 3048 vextracti128 [dstq+dsq*1], ym20, 1 3049 lea dstq, [dstq+dsq*2] 3050 sub hd, 2 3051 jg .hv_w8_loop 3052 vzeroupper 3053 RET 3054.hv_w16: 3055 WIN64_SPILL_XMM 26 3056 vbroadcasti32x4 m20, [spel_h_shufA] 3057 vbroadcasti32x4 m21, [spel_h_shufB] 3058 add wd, wd 3059 mova m9, [spel_shuf16] 3060 lea wd, [hq+wq*8-256] 3061.hv_w16_loop0: 3062 vbroadcasti32x8 m5, [srcq+ssq*0+ 8] 3063 vinserti32x8 m4, m5, [srcq+ssq*0+ 0], 0 3064 vinserti32x8 m5, [srcq+ssq*0+16], 1 ; 0 3065 movu ym6, 
[srcq+ssq*1+ 0] 3066 movu ym7, [srcq+ssq*1+16] 3067 lea r7, [srcq+r6] 3068 vinserti32x8 m6, [srcq+ssq*2+ 0], 1 3069 vinserti32x8 m7, [srcq+ssq*2+16], 1 ; 1 2 3070 movu ym22, [r7 +ssq*0+ 0] 3071 movu ym23, [r7 +ssq*0+16] 3072 mov r8, dstq 3073 vinserti32x8 m22, [r7 +ssq*1+ 0], 1 3074 vinserti32x8 m23, [r7 +ssq*1+16], 1 ; 3 4 3075 movu ym24, [r7 +ssq*2+ 0] 3076 movu ym25, [r7 +ssq*2+16] 3077 add r7, r6 3078 vinserti32x8 m24, [r7 +ssq*0+ 0], 1 3079 vinserti32x8 m25, [r7 +ssq*0+16], 1 ; 5 6 3080 pshufb m0, m4, m20 3081 mova m1, m10 3082 vpdpwssd m1, m11, m0 ; a0 3083 pshufb m0, m6, m20 3084 mova m2, m10 3085 vpdpwssd m2, m11, m0 ; b0 3086 pshufb m0, m7, m20 3087 mova m3, m10 3088 vpdpwssd m3, m13, m0 ; c2 3089 pshufb m0, m4, m21 3090 vpdpwssd m1, m12, m0 ; a1 3091 pshufb m0, m6, m21 3092 vpdpwssd m2, m12, m0 ; b1 3093 pshufb m0, m7, m21 3094 vpdpwssd m3, m14, m0 ; c3 3095 pshufb m0, m5, m20 3096 vpdpwssd m1, m13, m0 ; a2 3097 shufpd m6, m7, 0x55 3098 pshufb m7, m6, m20 3099 vpdpwssd m2, m13, m7 ; b2 3100 vpdpwssd m3, m11, m7 ; c0 3101 pshufb m5, m21 3102 vpdpwssd m1, m14, m5 ; a3 3103 pshufb m6, m21 3104 vpdpwssd m2, m14, m6 ; b3 3105 vpdpwssd m3, m12, m6 ; c1 3106 pshufb m0, m22, m20 3107 mova m4, m10 3108 vpdpwssd m4, m11, m0 ; d0 3109 pshufb m0, m23, m20 3110 mova m5, m10 3111 vpdpwssd m5, m13, m0 ; e2 3112 pshufb m0, m24, m20 3113 mova m6, m10 3114 vpdpwssd m6, m11, m0 ; f0 3115 pshufb m0, m25, m20 3116 mova m7, m10 3117 vpdpwssd m7, m13, m0 ; g2 3118 pshufb m0, m22, m21 3119 vpdpwssd m4, m12, m0 ; d1 3120 pshufb m0, m23, m21 3121 vpdpwssd m5, m14, m0 ; e3 3122 pshufb m0, m24, m21 3123 vpdpwssd m6, m12, m0 ; f1 3124 pshufb m0, m25, m21 3125 vpdpwssd m7, m14, m0 ; g3 3126 shufpd m22, m23, 0x55 3127 pshufb m23, m22, m20 3128 vpdpwssd m4, m13, m23 ; d2 3129 vpdpwssd m5, m11, m23 ; e0 3130 shufpd m24, m25, 0x55 3131 pshufb m25, m24, m20 3132 vpdpwssd m6, m13, m25 ; f2 3133 vpdpwssd m7, m11, m25 ; g0 3134 pshufb m22, m21 3135 vpdpwssd m4, m14, m22 ; d3 3136 vpdpwssd m5, 
m12, m22 ; e1 3137 pshufb m24, m21 3138 vpdpwssd m6, m14, m24 ; f3 3139 vpdpwssd m7, m12, m24 ; g1 3140 pslldq m1, 1 3141 vpermt2b m2, m9, m3 ; 12 3142 vpermt2b m4, m9, m5 ; 34 3143 vpermt2b m6, m9, m7 ; 56 3144 vpshrdd m1, m2, 16 ; 01 3145 vpshrdd m3, m2, m4, 16 ; 23 3146 vpshrdd m5, m4, m6, 16 ; 45 3147.hv_w16_loop: 3148 movu ym24, [r7+ssq*1+ 0] 3149 movu ym25, [r7+ssq*1+16] 3150 lea r7, [r7+ssq*2] 3151 vinserti32x8 m24, [r7+ssq*0+ 0], 1 3152 vinserti32x8 m25, [r7+ssq*0+16], 1 3153 mova m7, m10 3154 mova m8, m10 3155 pshufb m0, m24, m20 3156 vpdpwssd m7, m11, m0 ; h0 3157 pshufb m0, m25, m20 3158 vpdpwssd m8, m13, m0 ; i2 3159 pmaddwd m22, m16, m1 ; A0 3160 mova m1, m3 3161 pmaddwd m23, m16, m2 ; B0 3162 mova m2, m4 3163 pshufb m0, m24, m21 3164 vpdpwssd m7, m12, m0 ; h1 3165 pshufb m0, m25, m21 3166 vpdpwssd m8, m14, m0 ; i3 3167 vpdpwssd m22, m17, m3 ; A1 3168 mova m3, m5 3169 vpdpwssd m23, m17, m4 ; B1 3170 mova m4, m6 3171 shufpd m24, m25, 0x55 3172 pshufb m25, m24, m20 3173 vpdpwssd m7, m13, m25 ; h2 3174 vpdpwssd m8, m11, m25 ; i0 3175 vpdpwssd m22, m18, m5 ; A2 3176 vpdpwssd m23, m18, m6 ; B2 3177 pshufb m24, m21 3178 vpdpwssd m7, m14, m24 ; h3 3179 vpdpwssd m8, m12, m24 ; i1 3180 vpermt2b m7, m9, m8 ; 78 3181 vpshrdd m5, m6, m7, 16 ; 67 3182 vpdpwssd m22, m19, m5 ; A3 3183 vpdpwssd m23, m19, m7 ; B3 3184 mova m6, m7 3185 psrad m22, 10 3186 psrad m23, 10 3187 vshufi32x4 m0, m22, m23, q3232 3188 vinserti32x8 m22, ym23, 1 3189 packusdw m22, m0 3190 pminsw m22, m15 3191 mova [r8+dsq*0], ym22 3192 vextracti32x8 [r8+dsq*1], m22, 1 3193 lea r8, [r8+dsq*2] 3194 sub hd, 2 3195 jg .hv_w16_loop 3196 add srcq, 32 3197 add dstq, 32 3198 movzx hd, wb 3199 sub wd, 1<<8 3200 jg .hv_w16_loop0 3201 RET 3202 3203%if WIN64 3204DECLARE_REG_TMP 6, 4 3205%else 3206DECLARE_REG_TMP 6, 7 3207%endif 3208 3209%define PREP_8TAP_FN FN prep_8tap, 3210PREP_8TAP_FN smooth, SMOOTH, SMOOTH, prep_6tap_16bpc 3211PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR, prep_6tap_16bpc 3212PREP_8TAP_FN 
regular_smooth, REGULAR, SMOOTH, prep_6tap_16bpc 3213PREP_8TAP_FN regular, REGULAR, REGULAR 3214 3215cglobal prep_6tap_16bpc, 3, 8, 0, tmp, src, ss, w, h, mx, my 3216%define base r7-prep_avx512icl 3217 imul mxd, mxm, 0x010101 3218 add mxd, t0d ; 6tap_h, mx, 4tap_h 3219 imul myd, mym, 0x010101 3220 add myd, t1d ; 6tap_v, my, 4tap_v 3221 lea r7, [prep_avx512icl] 3222 mov wd, wm 3223 movifnidn hd, hm 3224 test mxd, 0xf00 3225 jnz .h 3226 test myd, 0xf00 3227 jnz .v 3228.prep: 3229 tzcnt wd, wd 3230 mov r5d, r7m ; bitdepth_max 3231 vpbroadcastd m5, [pw_8192] 3232 movzx wd, word [r7+wq*2+table_offset(prep,)] 3233 shr r5d, 11 3234 vpbroadcastd m4, [r7-prep_avx512icl+prep_mul+r5*4] 3235 add wq, r7 3236 lea r6, [ssq*3] 3237%if WIN64 3238 pop r7 3239%endif 3240 jmp wq 3241.h_w8: 3242 mova m6, [spel_h_shufA] 3243 movu m7, [spel_h_shufC] 3244 mova m8, [prep_endB] 3245.h_w8_loop: 3246 movu ym4, [srcq+ssq*0] 3247 vinserti32x8 m4, [srcq+ssq*1], 1 3248 movu ym5, [srcq+ssq*2] 3249 vinserti32x8 m5, [srcq+r6 ], 1 3250 lea srcq, [srcq+ssq*4] 3251 mova m0, m10 3252 mova m1, m10 3253 vpermb m2, m6, m4 3254 vpermb m3, m6, m5 3255 vpdpwssd m0, m12, m2 ; a0 b0 3256 vpdpwssd m1, m12, m3 ; c0 d0 3257 vpermb m4, m7, m4 3258 vpermb m5, m7, m5 3259 vpdpwssd m0, m14, m4 ; a2 b2 3260 vpdpwssd m1, m14, m5 ; c2 d2 3261 shufpd m2, m4, 0x55 3262 shufpd m3, m5, 0x55 3263 vpdpwssd m0, m13, m2 ; a1 b1 3264 vpdpwssd m1, m13, m3 ; c1 d1 3265 vpermt2b m0, m8, m1 3266 mova [tmpq], m0 3267 add tmpq, 64 3268 sub hd, 4 3269 jg .h_w8_loop 3270 RET 3271.h: 3272 vpbroadcastd m10, [prep_8tap_rnd] 3273 test myd, 0xf00 3274 jnz .hv 3275 lea r6, [ssq*3] 3276 cmp wd, 4 3277 je mangle(private_prefix %+ _prep_8tap_16bpc_avx512icl).h_w4 3278 shr mxd, 16 3279 pmovsxbw xmm0, [base+subpel_filters+1+mxq*8] 3280 mov r5d, r7m 3281 sub srcq, 4 3282 shr r5d, 11 3283 psllw xmm0, [base+prep_hv_shift+r5*8] 3284 mova [tmpq], xmm0 3285 vpbroadcastd m12, xmm0 3286 vpbroadcastd m13, [tmpq+ 4] 3287 vpbroadcastd m14, [tmpq+ 8] 3288 cmp 
wd, 16 3289 jl .h_w8 3290 vbroadcasti32x4 m5, [spel_h_shufA] 3291 vbroadcasti32x4 m6, [spel_h_shufB] 3292 mova m7, [prep_endC] 3293 jg .h_w32 3294.h_w16_loop: 3295 movu ym2, [srcq+ssq*0+ 0] 3296 vinserti32x8 m2, [srcq+ssq*1+ 0], 1 3297 movu ym3, [srcq+ssq*0+12] 3298 vinserti32x8 m3, [srcq+ssq*1+12], 1 3299 lea srcq, [srcq+ssq*2] 3300 mova m0, m10 3301 mova m1, m10 3302 pshufb m4, m2, m5 ; 01 3303 vpdpwssd m0, m12, m4 ; a0 b0 3304 pshufb m4, m3, m6 ; 89 3305 vpdpwssd m1, m14, m4 ; a2' b2' 3306 pshufb m2, m6 ; 23 3307 pshufb m3, m5 ; 67 3308 vpdpwssd m0, m13, m2 ; a1 b1 3309 vpdpwssd m1, m13, m3 ; a1' b1' 3310 shufpd m2, m3, 0x55 ; 45 3311 vpdpwssd m0, m14, m2 ; a2 b2 3312 vpdpwssd m1, m12, m2 ; a0' b0' 3313 vpermt2b m0, m7, m1 3314 mova [tmpq], m0 3315 add tmpq, 64 3316 sub hd, 2 3317 jg .h_w16_loop 3318 RET 3319.h_w32: 3320 lea srcq, [srcq+wq*2] 3321 neg wq 3322.h_w32_loop0: 3323 mov r6, wq 3324.h_w32_loop: 3325 movu m2, [srcq+r6*2+ 0] 3326 movu m3, [srcq+r6*2+12] 3327 mova m0, m10 3328 mova m1, m10 3329 pshufb m4, m2, m5 3330 vpdpwssd m0, m12, m4 3331 pshufb m4, m3, m6 3332 vpdpwssd m1, m14, m4 3333 pshufb m2, m6 3334 pshufb m3, m5 3335 vpdpwssd m0, m13, m2 3336 vpdpwssd m1, m13, m3 3337 shufpd m2, m3, 0x55 3338 vpdpwssd m0, m14, m2 3339 vpdpwssd m1, m12, m2 3340 vpermt2b m0, m7, m1 3341 mova [tmpq], m0 3342 add tmpq, 64 3343 add r6, 32 3344 jl .h_w32_loop 3345 add srcq, ssq 3346 dec hd 3347 jg .h_w32_loop0 3348 RET 3349.v: 3350 movzx mxd, myb 3351 shr myd, 16 3352 cmp hd, 4 3353 cmove myd, mxd 3354 mov r5d, r7m 3355 vpbroadcastd m10, [prep_8tap_rnd] 3356 pmovsxbw xmm0, [base+subpel_filters+1+myq*8] 3357 tzcnt r6d, wd 3358 shr r5d, 11 3359 movzx r6d, word [r7+r6*2+table_offset(prep, _6tap_v)] 3360 psllw xmm0, [base+prep_hv_shift+r5*8] 3361 add r7, r6 3362 mova [tmpq], xmm0 3363 vpbroadcastd m12, xmm0 3364 mov r6, ssq 3365 vpbroadcastd m13, [tmpq+ 4] 3366 neg r6 3367 vpbroadcastd m14, [tmpq+ 8] 3368 jmp r7 3369.v_w4: 3370 mov r3d, 0x330c 3371 movq xm1, [srcq+r6 *2] 
3372 kmovw k1, r3d 3373 vpbroadcastq ym1{k1}, [srcq+r6 *1] 3374 vpbroadcastq m2, [srcq+ssq*0] 3375 vinserti32x4 m1{k1}, m2, [srcq+ssq*1], 3 3376 movq xm0, [srcq+ssq*2] 3377 mova ym4, [prep_endA] 3378 valignq m0, m1, 2 3379 punpcklwd m1, m0 ; 01 12 23 34 3380.v_w4_loop: 3381 lea srcq, [srcq+ssq*4] 3382 movq xm2, [srcq+r6 *1] 3383 vpbroadcastq ym2{k1}, [srcq+ssq*0] 3384 vpbroadcastq m3, [srcq+ssq*1] 3385 vinserti32x4 m2{k1}, m3, [srcq+ssq*2], 3 3386 mova m3, m10 3387 vpdpwssd m3, m12, m1 ; a0 b0 c0 d0 3388 valignq m0, m2, m0, 6 ; 4 5 6 7 3389 punpcklwd m0, m2 ; 45 56 67 78 3390 vpdpwssd m3, m14, m0 ; a2 b2 c2 d2 3391 vshufi32x4 m1, m0, q1032 ; 23 34 45 56 3392 vpdpwssd m3, m13, m1 ; a1 b1 c1 d1 3393 mova m1, m0 3394 mova m0, m2 3395 vpermb m3, m4, m3 3396 mova [tmpq], ym3 3397 add tmpq, 32 3398 sub hd, 4 3399 jg .v_w4_loop 3400 RET 3401.v_w8: 3402 vbroadcasti32x4 ym1, [srcq+r6 *1] 3403 mov r3d, 0x33 3404 vbroadcasti32x4 m2, [srcq+ssq*0] 3405 kmovb k1, r3d 3406 mova m6, [spel_v_shuf8] 3407 vinserti64x2 m1{k1}, m2, [srcq+r6 *2], 0 ; 0 1 2 3408 vbroadcasti32x4 ym0, [srcq+ssq*1] 3409 vinserti64x2 m0{k1}, m2, [srcq+ssq*2], 2 ; 2 3 4 3410 mova m7, [prep_endB] 3411 vpermb m1, m6, m1 ; 01 12 3412 vpermb m2, m6, m0 ; 23 34 3413.v_w8_loop: 3414 lea srcq, [srcq+ssq*4] 3415 vbroadcasti32x4 ym3, [srcq+r6 *1] 3416 movu xm4, [srcq+ssq*0] 3417 vshufi64x2 m3{k1}, m0, m4, q1032 ; 4 5 6 3418 vbroadcasti32x4 ym0, [srcq+ssq*1] 3419 vinserti64x2 m0{k1}, m4, [srcq+ssq*2], 2 ; 6 7 8 3420 mova m4, m10 3421 vpdpwssd m4, m12, m1 ; a0 b0 3422 mova m5, m10 3423 vpdpwssd m5, m12, m2 ; c0 d0 3424 vpermb m1, m6, m3 ; 45 56 3425 vpdpwssd m4, m13, m2 ; a1 b1 3426 vpermb m2, m6, m0 ; 67 78 3427 vpdpwssd m5, m13, m1 ; c1 d1 3428 vpdpwssd m4, m14, m1 ; a2 b2 3429 vpdpwssd m5, m14, m2 ; c2 d2 3430 vpermt2b m4, m7, m5 3431 mova [tmpq], m4 3432 add tmpq, 64 3433 sub hd, 4 3434 jg .v_w8_loop 3435 RET 3436.v_w16: 3437 vbroadcasti32x8 m0, [srcq+r6 *1] 3438 vinserti32x8 m1, m0, [srcq+ssq*0], 1 ; 1 2 3439 
vinserti32x8 m0, [srcq+r6 *2], 0 ; 0 1 3440 mova m6, [spel_v_shuf16] 3441 movu ym3, [srcq+ssq*1] 3442 lea srcq, [srcq+ssq*2] 3443 vinserti32x8 m3, [srcq+ssq*0], 1 ; 3 4 3444 mova m7, [prep_endA] 3445 vpermb m1, m6, m1 ; 12 3446 vpermb m0, m6, m0 ; 01 3447 vpermb m3, m6, m3 ; 34 3448 vpshrdd m2, m1, m3, 16 ; 23 3449.v_w16_loop: 3450 mova m5, m10 3451 vpdpwssd m5, m12, m1 ; b0 3452 mova m4, m10 3453 vpdpwssd m4, m12, m0 ; a0 3454 mova m1, m3 3455 vpdpwssd m5, m13, m3 ; b1 3456 movu ym3, [srcq+ssq*1] 3457 lea srcq, [srcq+ssq*2] 3458 vpdpwssd m4, m13, m2 ; a1 3459 vinserti32x8 m3, [srcq+ssq*0], 1 3460 mova m0, m2 3461 vpermb m3, m6, m3 ; 56 3462 vpshrdd m2, m1, m3, 16 ; 45 3463 vpdpwssd m5, m14, m3 ; b2 3464 vpdpwssd m4, m14, m2 ; a2 3465 vpermt2b m4, m7, m5 3466 mova [tmpq], m4 3467 add tmpq, 64 3468 sub hd, 2 3469 jg .v_w16_loop 3470 RET 3471.v_w32: 3472.v_w64: 3473.v_w128: 3474%if WIN64 3475 push r8 3476%endif 3477 mova m11, [prep_endC] 3478 lea r5, [hq+wq*8-256] 3479.v_w32_loop0: 3480 movu m4, [srcq+r6 *2] 3481 movu m5, [srcq+r6 *1] 3482 lea r7, [srcq+ssq*2] 3483 movu m6, [srcq+ssq*0] 3484 movu m7, [srcq+ssq*1] 3485 mov r8, tmpq 3486 movu m8, [r7 +ssq*0] 3487 punpcklwd m0, m4, m5 ; 01 3488 punpckhwd m4, m5 3489 punpcklwd m1, m5, m6 ; 12 3490 punpckhwd m5, m6 3491 punpcklwd m2, m6, m7 ; 23 3492 punpckhwd m6, m7 3493 punpcklwd m3, m7, m8 ; 34 3494 punpckhwd m7, m8 3495.v_w32_loop: 3496 mova m16, m10 3497 movu m9, [r7+ssq*1] 3498 mova m18, m10 3499 vpdpwssd m16, m12, m0 ; a0 3500 mova m17, m10 3501 vpdpwssd m18, m12, m4 3502 mova m19, m10 3503 vpdpwssd m17, m12, m1 ; b0 3504 lea r7, [r7+ssq*2] 3505 vpdpwssd m19, m12, m5 3506 mova m0, m2 3507 vpdpwssd m16, m13, m2 ; a1 3508 punpcklwd m2, m8, m9 ; 45 3509 mova m4, m6 3510 vpdpwssd m18, m13, m6 3511 punpckhwd m6, m8, m9 3512 movu m8, [r7+ssq*0] 3513 vpdpwssd m17, m13, m3 ; b1 3514 mova m1, m3 3515 vpdpwssd m19, m13, m7 3516 mova m5, m7 3517 vpdpwssd m16, m14, m2 ; a2 3518 punpcklwd m3, m9, m8 ; 56 3519 vpdpwssd m18, m14, 
m6 3520 punpckhwd m7, m9, m8 3521 vpdpwssd m17, m14, m3 ; b2 3522 vpdpwssd m19, m14, m7 3523 vpermt2b m16, m11, m18 3524 vpermt2b m17, m11, m19 3525 mova [r8+wq*0], m16 3526 mova [r8+wq*2], m17 3527 lea r8, [r8+wq*4] 3528 sub hd, 2 3529 jg .v_w32_loop 3530 add srcq, 64 3531 add tmpq, 64 3532 movzx hd, r5b 3533 sub r5d, 1<<8 3534 jg .v_w32_loop0 3535%if WIN64 3536 pop r8 3537%endif 3538 vzeroupper 3539 RET 3540.hv_w4: 3541 movzx mxd, mxb 3542 pmovsxbw xmm0, [base+subpel_filters+mxq*8] 3543 movzx mxd, myb 3544 shr myd, 16 3545 cmp hd, 4 3546 cmove myd, mxd 3547 mov r5d, r7m 3548 pmovsxbw xmm1, [base+subpel_filters+1+myq*8] 3549 mov r6, ssq 3550 sub srcq, 2 3551 shr r5d, 11 3552 neg r6 3553 psllw xmm0, [base+prep_hv_shift+r5*8] 3554 psllw xmm1, 2 3555 mova [tmpq+ 0], xmm0 3556 mova [tmpq+16], xmm1 3557 vpbroadcastd m8, [tmpq+ 4] 3558 mov r3d, 0xf0 3559 vpbroadcastd m9, [tmpq+ 8] 3560 vpbroadcastd m12, xmm1 3561 movu xm3, [srcq+r6 *2] 3562 kmovb k1, r3d 3563 vinserti32x4 ym3, [srcq+r6 *1], 1 3564 vbroadcasti32x4 m2, [srcq+ssq*0] 3565 vinserti64x2 m3{k1}, m2, [srcq+ssq*1], 3 3566 movu xm4, [srcq+ssq*2] 3567 vbroadcasti32x4 m5, [spel_h_shufA] 3568 vbroadcasti32x4 m6, [spel_h_shufB] 3569 mova m1, m11 3570 mova m15, [spel_shuf4a] 3571 mova xm2, xm11 3572 pshufb m0, m3, m5 3573 vpdpwssd m1, m8, m0 3574 pshufb xm0, xm4, xm5 3575 vpdpwssd xm2, xm8, xm0 3576 vpbroadcastd m13, [tmpq+20] 3577 pshufb m3, m6 3578 vpbroadcastd m14, [tmpq+24] 3579 pshufb xm4, xm6 3580 mova m7, [spel_shuf4b] 3581 vpdpwssd m1, m9, m3 ; 0 1 2 3 3582 vpdpwssd xm2, xm9, xm4 ; 4 3583 vpermt2b m1, m15, m2 ; 01 12 23 34 3584 mova ym15, [prep_endA] 3585.hv_w4_loop: 3586 lea srcq, [srcq+ssq*4] 3587 movu xm4, [srcq+r6 *1] 3588 vinserti32x4 ym4, [srcq+ssq*0], 1 3589 vbroadcasti32x4 m3, [srcq+ssq*1] 3590 vinserti64x2 m4{k1}, m3, [srcq+ssq*2], 3 3591 mova m2, m11 3592 pshufb m3, m4, m5 3593 vpdpwssd m2, m8, m3 3594 mova m3, m10 3595 vpdpwssd m3, m12, m1 ; a0 b0 c0 d0 3596 pshufb m4, m6 3597 vpdpwssd m2, m9, m4 ; 
5 6 7 8 3598 mova m4, m1 3599 vpermt2b m1, m7, m2 ; 45 56 67 78 3600 vpdpwssd m3, m14, m1 ; a2 b2 c2 d2 3601 vshufi32x4 m4, m1, q1032 ; 23 34 45 56 3602 vpdpwssd m3, m13, m4 ; a1 b1 c1 d1 3603 vpermb m3, m15, m3 3604 mova [tmpq], ym3 3605 add tmpq, 32 3606 sub hd, 4 3607 jg .hv_w4_loop 3608 RET 3609.hv_w8: 3610 mova m8, [spel_h_shufA] 3611 movu ym18, [srcq+r6 *2] 3612 vinserti32x8 m18, [srcq+r6 *1], 1 ; 0 1 3613 movu ym19, [srcq+ssq*0] 3614 vinserti32x8 m19, [srcq+ssq*1], 1 ; 2 3 3615 movu ym20, [srcq+ssq*2] ; 4 3616 movu m9, [spel_h_shufC] 3617 mova m21, [spel_shuf8a] 3618 mova m0, [spel_shuf8b] 3619 vpermb m4, m8, m18 3620 mova m1, m10 3621 vpermb m5, m8, m19 3622 vpdpwssd m1, m12, m4 ; a0 b0 3623 mova m2, m10 3624 vpermb m6, m8, m20 3625 vpdpwssd m2, m12, m5 ; c0 d0 3626 mova m3, m10 3627 vpermb m18, m9, m18 3628 vpdpwssd m3, m12, m6 ; e0 3629 mova m7, [prep_endB] 3630 vpermb m19, m9, m19 3631 vpdpwssd m1, m14, m18 ; a2 b2 3632 vpermb m20, m9, m20 3633 vpdpwssd m2, m14, m19 ; c2 d2 3634 shufpd m4, m18, 0x55 3635 vpdpwssd m3, m14, m20 ; e2 3636 shufpd m5, m19, 0x55 3637 vpdpwssd m1, m13, m4 ; a1 b1 3638 shufpd m6, m20, 0x55 3639 vpdpwssd m2, m13, m5 ; c1 d1 3640 vpdpwssd m3, m13, m6 ; e1 3641 vpermt2b m1, m21, m2 ; 01 12 3642 vpermt2b m2, m21, m3 ; 23 34 3643.hv_w8_loop: 3644 lea srcq, [srcq+ssq*4] 3645 movu ym18, [srcq+r6 *1] 3646 vinserti32x8 m18, [srcq+ssq*0], 1 3647 movu ym19, [srcq+ssq*1] 3648 vinserti32x8 m19, [srcq+ssq*2], 1 3649 mova m3, m10 3650 vpermb m5, m8, m18 3651 mova m4, m10 3652 vpermb m6, m8, m19 3653 vpdpwssd m3, m12, m5 ; f0 g0 3654 mova m20, m11 3655 vpdpwssd m4, m12, m6 ; h0 i0 3656 mova m21, m11 3657 vpdpwssd m20, m15, m1 ; A0 B0 3658 vpermb m18, m9, m18 3659 vpdpwssd m21, m15, m2 ; C0 D0 3660 vpermb m19, m9, m19 3661 vpdpwssd m3, m14, m18 ; f2 g2 3662 vpdpwssd m4, m14, m19 ; h2 i2 3663 shufpd m5, m18, 0x55 3664 vpdpwssd m20, m16, m2 ; A1 B1 3665 shufpd m6, m19, 0x55 3666 vpdpwssd m3, m13, m5 ; f1 g1 3667 vpdpwssd m4, m13, m6 ; h1 i1 3668 
vpermt2b m2, m0, m3 ; 45 56 3669 vpdpwssd m21, m16, m2 ; C1 D1 3670 mova m1, m2 3671 vpermt2b m2, m0, m4 ; 67 78 3672 vpdpwssd m20, m17, m1 ; A2 B2 3673 vpdpwssd m21, m17, m2 ; A2 B2 3674 vpermt2b m20, m7, m21 3675 mova [tmpq], m20 3676 add tmpq, 64 3677 sub hd, 4 3678 jg .hv_w8_loop 3679 vzeroupper 3680 RET 3681.hv: 3682 vpbroadcastd m11, [pd_128] 3683 cmp wd, 4 3684 je .hv_w4 3685 shr mxd, 16 3686 pmovsxbw xmm0, [base+subpel_filters+1+mxq*8] 3687 movzx mxd, myb 3688 shr myd, 16 3689 cmp hd, 6 3690 cmovs myd, mxd 3691 mov r5d, r7m 3692 pmovsxbw xmm1, [base+subpel_filters+1+myq*8] 3693 mov r6, ssq 3694 sub srcq, 4 3695 shr r5d, 11 3696 neg r6 3697 psllw xmm0, [base+prep_hv_shift+r5*8] 3698 psllw xmm1, 2 3699 mova [tmpq+ 0], xmm0 3700 mova [tmpq+16], xmm1 3701 vpbroadcastd m12, xmm0 3702 vpbroadcastd m13, [tmpq+ 4] 3703 vpbroadcastd m14, [tmpq+ 8] 3704 vpbroadcastd m15, xmm1 3705 vpbroadcastd m16, [tmpq+20] 3706 vpbroadcastd m17, [tmpq+24] 3707 cmp wd, 16 3708 jl .hv_w8 3709 vbroadcasti32x4 m8, [spel_h_shufA] 3710 vbroadcasti32x4 m9, [spel_h_shufB] 3711 jg .hv_w32 3712 vbroadcasti32x8 m6, [srcq+r6 *2+ 8] 3713 vinserti32x8 m2, m6, [srcq+r6 *2+16], 1 3714 vinserti32x8 m6, [srcq+r6 *2+ 0], 0 ; 0 3715 movu ym18, [srcq+r6 *1+ 0] 3716 movu ym19, [srcq+r6 *1+12] 3717 vinserti32x8 m18, [srcq+ssq*0+ 0], 1 3718 vinserti32x8 m19, [srcq+ssq*0+12], 1 ; 1 2 3719 movu ym20, [srcq+ssq*1+ 0] 3720 movu ym21, [srcq+ssq*1+12] 3721 lea srcq, [srcq+ssq*2] 3722 vinserti32x8 m20, [srcq+ssq*0+ 0], 1 3723 vinserti32x8 m21, [srcq+ssq*0+12], 1 ; 3 4 3724 pshufb m2, m8 3725 mova m1, m10 3726 pshufb m3, m18, m8 3727 vpdpwssd m1, m14, m2 ; a2 3728 mova m2, m10 3729 pshufb m4, m19, m9 3730 vpdpwssd m2, m12, m3 ; b0 c0 3731 mova m3, m10 3732 pshufb m5, m20, m8 3733 vpdpwssd m3, m14, m4 ; b2' c2' 3734 mova m4, m10 3735 pshufb m7, m21, m9 3736 vpdpwssd m4, m12, m5 ; d0 e0 3737 mova m5, m10 3738 pshufb m0, m6, m8 3739 vpdpwssd m5, m14, m7 ; d2' e2' 3740 mova m7, [spel_shuf16] 3741 pshufb m18, m9 3742 
vpdpwssd m1, m12, m0 ; a0 3743 pshufb m19, m8 3744 vpdpwssd m2, m13, m18 ; b1 c1 3745 pshufb m20, m9 3746 vpdpwssd m3, m13, m19 ; b1' c1' 3747 pshufb m21, m8 3748 vpdpwssd m4, m13, m20 ; d1 e1 3749 pshufb m6, m9 3750 vpdpwssd m5, m13, m21 ; d1' e1' 3751 mova m0, [prep_endB] 3752 shufpd m18, m19, 0x55 3753 vpdpwssd m1, m13, m6 ; a1 3754 shufpd m20, m21, 0x55 3755 vpdpwssd m2, m14, m18 ; b2 c2 3756 vpdpwssd m3, m12, m18 ; b0' c0' 3757 vpdpwssd m4, m14, m20 ; d2 e2 3758 vpdpwssd m5, m12, m20 ; d0' e0' 3759 pslldq m1, 1 3760 vpermt2b m2, m7, m3 ; 12 3761 vpermt2b m4, m7, m5 ; 34 3762 vpshrdd m1, m2, 16 ; 01 3763 vpshrdd m3, m2, m4, 16 ; 23 3764.hv_w16_loop: 3765 movu ym18, [srcq+ssq*1+ 0] 3766 movu ym19, [srcq+ssq*1+12] 3767 lea srcq, [srcq+ssq*2] 3768 vinserti32x8 m18, [srcq+ssq*0+ 0], 1 3769 vinserti32x8 m19, [srcq+ssq*0+12], 1 3770 mova m5, m10 3771 mova m6, m10 3772 pshufb m21, m18, m8 3773 vpdpwssd m5, m12, m21 ; f0 g0 3774 pshufb m20, m19, m9 3775 mova m21, m11 3776 vpdpwssd m6, m14, m20 ; f2' g2' 3777 mova m20, m11 3778 vpdpwssd m21, m15, m2 ; B0 3779 mova m2, m4 3780 vpdpwssd m20, m15, m1 ; A0 3781 mova m1, m3 3782 pshufb m18, m9 3783 vpdpwssd m5, m13, m18 ; f1 g1 3784 pshufb m19, m8 3785 vpdpwssd m6, m13, m19 ; f1' g1' 3786 vpdpwssd m21, m16, m4 ; B1 3787 vpdpwssd m20, m16, m3 ; A1 3788 shufpd m18, m19, 0x55 3789 vpdpwssd m5, m14, m18 ; f2 g2 3790 vpdpwssd m6, m12, m18 ; f0' g0' 3791 mova m4, m7 3792 vpermi2b m4, m5, m6 ; 56 3793 vpshrdd m3, m2, m4, 16 ; 45 3794 vpdpwssd m21, m17, m4 ; B2 3795 vpdpwssd m20, m17, m3 ; A2 3796 vpermt2b m20, m0, m21 3797 mova [tmpq], m20 3798 add tmpq, 64 3799 sub hd, 2 3800 jg .hv_w16_loop 3801 vzeroupper 3802 RET 3803.hv_w32: 3804 WIN64_SPILL_XMM 29 3805%if WIN64 3806 push r8 3807%endif 3808 mova m27, [spel_shuf32] 3809 lea r5d, [hq+wq*8-256] 3810 mova m28, [prep_endC] 3811.hv_w32_loop0: 3812 movu m18, [srcq+r6 *2+ 0] 3813 movu m7, [srcq+r6 *2+12] 3814 movu m6, [srcq+r6 *1+ 0] 3815 movu m20, [srcq+r6 *1+12] 3816 lea r7, 
[srcq+ssq*2] 3817 movu m19, [srcq+ssq*0+ 0] 3818 movu m21, [srcq+ssq*0+12] 3819 movu m22, [srcq+ssq*1+ 0] 3820 movu m24, [srcq+ssq*1+12] 3821 mov r8, tmpq 3822 movu m23, [r7 +ssq*0+ 0] 3823 movu m25, [r7 +ssq*0+12] 3824 pshufb m1, m18, m8 3825 mova m0, m10 3826 pshufb m2, m7, m9 3827 vpdpwssd m0, m12, m1 ; a0 3828 mova m1, m10 3829 pshufb m4, m6, m8 3830 vpdpwssd m1, m14, m2 ; a2' 3831 mova m2, m10 3832 pshufb m3, m19, m8 3833 vpdpwssd m2, m12, m4 ; b0 3834 mova m4, m10 3835 pshufb m5, m20, m9 3836 vpdpwssd m4, m12, m3 ; c0 3837 mova m3, m10 3838 pshufb m26, m21, m9 3839 vpdpwssd m3, m14, m5 ; b2' 3840 mova m5, m10 3841 pshufb m18, m9 3842 vpdpwssd m5, m14, m26 ; c2' 3843 pshufb m7, m8 3844 vpdpwssd m0, m13, m18 ; a1 3845 pshufb m6, m9 3846 vpdpwssd m1, m13, m7 ; a1' 3847 pshufb m19, m9 3848 vpdpwssd m2, m13, m6 ; b1 3849 pshufb m20, m8 3850 vpdpwssd m4, m13, m19 ; c1 3851 pshufb m21, m8 3852 vpdpwssd m3, m13, m20 ; b1' 3853 shufpd m18, m7, 0x55 3854 vpdpwssd m5, m13, m21 ; c1' 3855 shufpd m6, m20, 0x55 3856 vpdpwssd m0, m14, m18 ; a2 3857 shufpd m19, m21, 0x55 3858 vpdpwssd m1, m12, m18 ; a0' 3859 pshufb m18, m22, m8 3860 vpdpwssd m2, m14, m6 ; b2 3861 pshufb m7, m23, m8 3862 vpdpwssd m4, m14, m19 ; c2 3863 vpdpwssd m3, m12, m6 ; b0' 3864 mova m6, m10 3865 vpdpwssd m5, m12, m19 ; c0' 3866 pshufb m19, m24, m9 3867 vpdpwssd m6, m12, m18 ; d0 3868 mova m18, m10 3869 pshufb m26, m25, m9 3870 vpdpwssd m18, m12, m7 ; e0 3871 mova m7, m10 3872 pshufb m22, m9 3873 vpdpwssd m7, m14, m19 ; d2' 3874 mova m19, m10 3875 pshufb m23, m9 3876 vpdpwssd m19, m14, m26 ; e2' 3877 pshufb m24, m8 3878 vpdpwssd m6, m13, m22 ; d1 3879 pshufb m25, m8 3880 vpdpwssd m18, m13, m23 ; e1 3881 shufpd m22, m24, 0x55 3882 vpdpwssd m7, m13, m24 ; d1' 3883 shufpd m23, m25, 0x55 3884 vpdpwssd m19, m13, m25 ; e1' 3885 pslldq m0, 1 3886 vpdpwssd m6, m14, m22 ; d2 3887 pslldq m1, 1 3888 vpdpwssd m18, m14, m23 ; e2 3889 vpermt2b m2, m27, m4 ; 12 3890 vpdpwssd m7, m12, m22 ; d0' 3891 vpermt2b m3, m27, m5 
; 12' 3892 vpdpwssd m19, m12, m23 ; e0' 3893 vpshrdd m0, m2, 16 ; 01 3894 vpermt2b m6, m27, m18 ; 34 3895 vpshrdd m1, m3, 16 ; 01' 3896 vpermt2b m7, m27, m19 ; 34' 3897 vpshrdd m4, m2, m6, 16 ; 23 3898 vpshrdd m5, m3, m7, 16 ; 23' 3899.hv_w32_loop: 3900 movu m22, [r7+ssq*1+ 0] 3901 movu m24, [r7+ssq*1+12] 3902 lea r7, [r7+ssq*2] 3903 movu m23, [r7+ssq*0+ 0] 3904 movu m25, [r7+ssq*0+12] 3905 mova m19, m11 3906 vpdpwssd m19, m15, m2 ; B0 3907 mova m21, m11 3908 vpdpwssd m21, m15, m3 ; B0' 3909 mova m18, m11 3910 vpdpwssd m18, m15, m0 ; A0 3911 mova m20, m11 3912 vpdpwssd m20, m15, m1 ; A0' 3913 mova m2, m6 3914 vpdpwssd m19, m16, m6 ; B1 3915 mova m3, m7 3916 vpdpwssd m21, m16, m7 ; B1' 3917 mova m0, m4 3918 vpdpwssd m18, m16, m4 ; A1 3919 mova m1, m5 3920 pshufb m4, m22, m8 3921 vpdpwssd m20, m16, m5 ; A1' 3922 mova m6, m10 3923 pshufb m7, m23, m8 3924 vpdpwssd m6, m12, m4 ; f0 3925 mova m4, m10 3926 pshufb m5, m24, m9 3927 vpdpwssd m4, m12, m7 ; g0 3928 mova m7, m10 3929 pshufb m26, m25, m9 3930 vpdpwssd m7, m14, m5 ; f2' 3931 mova m5, m10 3932 pshufb m22, m9 3933 vpdpwssd m5, m14, m26 ; g2' 3934 pshufb m23, m9 3935 vpdpwssd m6, m13, m22 ; f1 3936 pshufb m24, m8 3937 vpdpwssd m4, m13, m23 ; g1 3938 pshufb m25, m8 3939 vpdpwssd m7, m13, m24 ; f1' 3940 shufpd m22, m24, 0x55 3941 vpdpwssd m5, m13, m25 ; g1' 3942 shufpd m23, m25, 0x55 3943 vpdpwssd m6, m14, m22 ; f2 3944 vpdpwssd m4, m14, m23 ; g2 3945 vpdpwssd m7, m12, m22 ; f0' 3946 vpdpwssd m5, m12, m23 ; g0' 3947 vpermt2b m6, m27, m4 ; 56 3948 vpermt2b m7, m27, m5 ; 56' 3949 vpdpwssd m19, m17, m6 ; B2 3950 vpshrdd m4, m2, m6, 16 ; 45 3951 vpdpwssd m21, m17, m7 ; B2' 3952 vpshrdd m5, m3, m7, 16 ; 45' 3953 vpdpwssd m18, m17, m4 ; A2 3954 vpdpwssd m20, m17, m5 ; A2' 3955 vpermt2b m19, m28, m21 3956 vpermt2b m18, m28, m20 3957 mova [r8+wq*0], m18 3958 mova [r8+wq*2], m19 3959 lea r8, [r8+wq*4] 3960 sub hd, 2 3961 jg .hv_w32_loop 3962 add srcq, 64 3963 add tmpq, 64 3964 movzx hd, r5b 3965 sub r5d, 1<<8 3966 jg 
.hv_w32_loop0 3967%if WIN64 3968 pop r8 3969%endif 3970 RET 3971 3972PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP, prep_8tap_16bpc 3973PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH, prep_8tap_16bpc 3974PREP_8TAP_FN regular_sharp, REGULAR, SHARP, prep_8tap_16bpc 3975PREP_8TAP_FN sharp_regular, SHARP, REGULAR, prep_8tap_16bpc 3976PREP_8TAP_FN sharp, SHARP, SHARP 3977 3978cglobal prep_8tap_16bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my 3979%define base r7-prep_avx512icl 3980 imul mxd, mxm, 0x010101 3981 add mxd, t0d ; 8tap_h, mx, 4tap_h 3982 imul myd, mym, 0x010101 3983 add myd, t1d ; 8tap_v, my, 4tap_v 3984 lea r7, [prep_avx512icl] 3985 mov wd, wm 3986 movifnidn hd, hm 3987 test mxd, 0xf00 3988 jnz .h 3989 test myd, 0xf00 3990 jz mangle(private_prefix %+ _prep_6tap_16bpc_avx512icl).prep 3991.v: 3992 movzx mxd, myb 3993 shr myd, 16 3994 cmp hd, 4 3995 cmove myd, mxd 3996 mov r5d, r7m 3997 vpbroadcastd m10, [prep_8tap_rnd] 3998 pmovsxbw xmm0, [base+subpel_filters+myq*8] 3999 tzcnt r6d, wd 4000 shr r5d, 11 4001 movzx r6d, word [r7+r6*2+table_offset(prep, _8tap_v)] 4002 psllw xmm0, [base+prep_hv_shift+r5*8] 4003 add r7, r6 4004 lea r6, [strideq*3] 4005 sub srcq, r6 4006 mova [tmpq], xmm0 4007 vpbroadcastd m12, xmm0 4008 vpbroadcastd m13, [tmpq+ 4] 4009 vpbroadcastd m14, [tmpq+ 8] 4010 vpbroadcastd m15, [tmpq+12] 4011 jmp r7 4012.v_w4: 4013 mov r3d, 0x330c 4014 movq xm1, [srcq+strideq*0] 4015 kmovw k1, r3d 4016 vpbroadcastq ym1{k1}, [srcq+strideq*1] 4017 vpbroadcastq m0, [srcq+r6 ] 4018 vinserti32x4 m1{k1}, m0, [srcq+strideq*2], 2 ; 0 1 2 3 4019 lea srcq, [srcq+strideq*4] 4020 vpbroadcastq ym0{k1}, [srcq+strideq*0] 4021 vpbroadcastq m2, [srcq+strideq*1] 4022 vinserti32x4 m0{k1}, m2, [srcq+strideq*2], 3 ; 3 4 5 6 4023 mova ym5, [prep_endA] 4024 vshufi32x4 m3, m1, m0, q1021 ; 1 2 3 4 4025 vshufi32x4 m2, m1, m0, q2132 ; 2 3 4 5 4026 punpcklwd m1, m3 ; 01 12 23 34 4027 punpcklwd m2, m0 ; 23 34 45 56 4028.v_w4_loop: 4029 movq xm4, [srcq+r6 ] 4030 lea srcq, [srcq+strideq*4] 4031 
vpbroadcastq ym4{k1}, [srcq+strideq*0] 4032 vpbroadcastq m3, [srcq+strideq*1] 4033 vinserti32x4 m4{k1}, m3, [srcq+strideq*2], 3 ; 7 8 9 a 4034 mova m3, m10 4035 vpdpwssd m3, m12, m1 ; a0 b0 c0 d0 4036 valignq m1, m4, m0, 6 ; 6 7 8 9 4037 vpdpwssd m3, m13, m2 ; a1 b1 c1 d1 4038 mova m0, m4 4039 punpcklwd m4, m1, m4 ; 67 78 89 9a 4040 vpdpwssd m3, m15, m4 ; a3 b3 c3 d3 4041 vshufi32x4 m1, m2, m4, q1032 ; 45 56 67 78 4042 vpdpwssd m3, m14, m1 ; a2 b2 c2 d2 4043 mova m2, m4 4044 vpermb m3, m5, m3 4045 mova [tmpq], ym3 4046 add tmpq, 32 4047 sub hd, 4 4048 jg .v_w4_loop 4049 RET 4050.v_w8: 4051 movu xm0, [srcq+strideq*0] 4052 mov r3d, 0x33 4053 vbroadcasti32x4 ym1, [srcq+strideq*1] 4054 kmovb k1, r3d 4055 mova m7, [spel_v_shuf8] 4056 vinserti64x2 m1{k1}, m0, [srcq+strideq*2], 2 ; 0 1 2 4057 add srcq, r6 4058 vbroadcasti32x4 ym2, [srcq+strideq*0] 4059 vbroadcasti32x4 m3, [srcq+strideq*1] 4060 vbroadcasti32x4 ym0, [srcq+strideq*2] 4061 vshufi64x2 m2{k1}, m1, m3, q1032 ; 2 3 4 4062 vinserti64x2 m0{k1}, m3, [srcq+r6], 2 ; 4 5 6 4063 mova m8, [prep_endB] 4064 vpermb m1, m7, m1 ; 01 12 4065 vpermb m2, m7, m2 ; 23 34 4066 vpermb m3, m7, m0 ; 45 56 4067.v_w8_loop: 4068 lea srcq, [srcq+strideq*4] 4069 vbroadcasti32x4 ym4, [srcq+strideq*0] 4070 movu xm5, [srcq+strideq*1] 4071 vshufi64x2 m4{k1}, m0, m5, q1032 ; 6 7 8 4072 vbroadcasti32x4 ym0, [srcq+strideq*2] 4073 vinserti64x2 m0{k1}, m5, [srcq+r6], 2 ; 8 9 a 4074 mova m5, m10 4075 vpdpwssd m5, m12, m1 ; a0 b0 4076 mova m6, m10 4077 vpdpwssd m6, m12, m2 ; c0 d0 4078 mova m1, m3 4079 vpdpwssd m5, m13, m2 ; a1 b1 4080 vpdpwssd m6, m13, m3 ; c1 d1 4081 vpermb m2, m7, m4 ; 67 78 4082 vpdpwssd m5, m14, m3 ; a2 b2 4083 vpermb m3, m7, m0 ; 89 9a 4084 vpdpwssd m6, m14, m2 ; c2 d2 4085 vpdpwssd m5, m15, m2 ; a3 b3 4086 vpdpwssd m6, m15, m3 ; c3 d3 4087 vpermt2b m5, m8, m6 4088 mova [tmpq], m5 4089 add tmpq, 64 4090 sub hd, 4 4091 jg .v_w8_loop 4092 RET 4093.v_w16: 4094 vbroadcasti32x8 m0, [srcq+strideq*1] 4095 vinserti32x8 m1, m0,
[srcq+strideq*2], 1 4096 vinserti32x8 m0, [srcq+strideq*0], 0 4097 mova m8, [spel_v_shuf16] 4098 add srcq, r6 4099 movu ym3, [srcq+strideq*0] 4100 vinserti32x8 m3, [srcq+strideq*1], 1 4101 movu ym5, [srcq+strideq*2] 4102 add srcq, r6 4103 vinserti32x8 m5, [srcq+strideq*0], 1 4104 mova m11, [prep_endA] 4105 vpermb m1, m8, m1 ; 12 4106 vpermb m0, m8, m0 ; 01 4107 vpermb m3, m8, m3 ; 34 4108 vpermb m5, m8, m5 ; 56 4109 vpshrdd m2, m1, m3, 16 ; 23 4110 vpshrdd m4, m3, m5, 16 ; 45 4111.v_w16_loop: 4112 mova m7, m10 4113 vpdpwssd m7, m12, m1 ; b0 4114 mova m6, m10 4115 vpdpwssd m6, m12, m0 ; a0 4116 mova m1, m3 4117 vpdpwssd m7, m13, m3 ; b1 4118 mova m0, m2 4119 vpdpwssd m6, m13, m2 ; a1 4120 mova m3, m5 4121 vpdpwssd m7, m14, m5 ; b2 4122 mova m2, m4 4123 vpdpwssd m6, m14, m4 ; a2 4124 movu ym5, [srcq+strideq*1] 4125 lea srcq, [srcq+strideq*2] 4126 vinserti32x8 m5, [srcq+strideq*0], 1 4127 vpermb m5, m8, m5 ; 78 4128 vpshrdd m4, m3, m5, 16 ; 67 4129 vpdpwssd m7, m15, m5 ; b3 4130 vpdpwssd m6, m15, m4 ; a3 4131 vpermt2b m6, m11, m7 4132 mova [tmpq], m6 4133 add tmpq, 64 4134 sub hd, 2 4135 jg .v_w16_loop 4136 RET 4137.v_w32: 4138.v_w64: 4139.v_w128: 4140 WIN64_PUSH_XMM 23 4141%if WIN64 4142 push r8 4143%endif 4144 mova m11, [prep_endC] 4145 lea r5, [hq+wq*8-256] 4146.v_w32_loop0: 4147 movu m16, [srcq+strideq*0] 4148 movu m17, [srcq+strideq*1] 4149 lea r7, [srcq+r6] 4150 movu m18, [srcq+strideq*2] 4151 movu m19, [r7 +strideq*0] 4152 mov r8, tmpq 4153 movu m20, [r7 +strideq*1] 4154 movu m21, [r7 +strideq*2] 4155 add r7, r6 4156 movu m22, [r7 +strideq*0] 4157 punpcklwd m0, m16, m17 ; 01l 4158 punpckhwd m16, m17 ; 01h 4159 punpcklwd m1, m17, m18 ; 12l 4160 punpckhwd m17, m18 ; 12h 4161 punpcklwd m2, m18, m19 ; 23l 4162 punpckhwd m18, m19 ; 23h 4163 punpcklwd m3, m19, m20 ; 34l 4164 punpckhwd m19, m20 ; 34h 4165 punpcklwd m4, m20, m21 ; 45l 4166 punpckhwd m20, m21 ; 45h 4167 punpcklwd m5, m21, m22 ; 56l 4168 punpckhwd m21, m22 ; 56h 4169.v_w32_loop: 4170 mova m6, m10 4171 
vpdpwssd m6, m12, m0 ; a0l 4172 mova m8, m10 4173 vpdpwssd m8, m12, m16 ; a0h 4174 mova m7, m10 4175 vpdpwssd m7, m12, m1 ; b0l 4176 mova m9, m10 4177 vpdpwssd m9, m12, m17 ; b0h 4178 mova m0, m2 4179 vpdpwssd m6, m13, m2 ; a1l 4180 mova m16, m18 4181 vpdpwssd m8, m13, m18 ; a1h 4182 mova m1, m3 4183 vpdpwssd m7, m13, m3 ; b1l 4184 mova m17, m19 4185 vpdpwssd m9, m13, m19 ; b1h 4186 mova m2, m4 4187 vpdpwssd m6, m14, m4 ; a2l 4188 mova m18, m20 4189 vpdpwssd m8, m14, m20 ; a2h 4190 mova m3, m5 4191 vpdpwssd m7, m14, m5 ; b2l 4192 mova m19, m21 4193 vpdpwssd m9, m14, m21 ; b2h 4194 movu m21, [r7+strideq*1] 4195 lea r7, [r7+strideq*2] 4196 punpcklwd m4, m22, m21 ; 67l 4197 punpckhwd m20, m22, m21 ; 67h 4198 movu m22, [r7+strideq*0] 4199 vpdpwssd m6, m15, m4 ; a3l 4200 vpdpwssd m8, m15, m20 ; a3h 4201 punpcklwd m5, m21, m22 ; 78l 4202 punpckhwd m21, m22 ; 78h 4203 vpdpwssd m7, m15, m5 ; b3l 4204 vpdpwssd m9, m15, m21 ; b3h 4205 vpermt2b m6, m11, m8 4206 vpermt2b m7, m11, m9 4207 mova [r8+wq*0], m6 4208 mova [r8+wq*2], m7 4209 lea r8, [r8+wq*4] 4210 sub hd, 2 4211 jg .v_w32_loop 4212 add srcq, 64 4213 add tmpq, 64 4214 movzx hd, r5b 4215 sub r5d, 1<<8 4216 jg .v_w32_loop0 4217%if WIN64 4218 pop r8 4219%endif 4220 RET 4221.h_w4: 4222 RESET_STACK_STATE 4223 movzx mxd, mxb 4224 sub srcq, 2 4225 pmovsxbw xmm0, [base+subpel_filters+mxq*8] 4226 mov r5d, r7m 4227 vbroadcasti32x4 m4, [spel_h_shufA] 4228 vbroadcasti32x4 m5, [spel_h_shufB] 4229 shr r5d, 11 4230 mova ym9, [prep_endA] 4231 psllw xmm0, [base+prep_hv_shift+r5*8] 4232 mova [tmpq], xmm0 4233 vpbroadcastd m6, [tmpq+4] 4234 vpbroadcastd m7, [tmpq+8] 4235.h_w4_loop: 4236 movu xm2, [srcq+strideq*0] 4237 vinserti32x4 ym2, [srcq+strideq*1], 1 4238 vinserti32x4 m2, [srcq+strideq*2], 2 4239 vinserti32x4 m2, [srcq+r6 ], 3 4240 lea srcq, [srcq+strideq*4] 4241 mova m0, m10 4242 pshufb m1, m2, m4 4243 vpdpwssd m0, m6, m1 4244 pshufb m2, m5 4245 vpdpwssd m0, m7, m2 4246 vpermb m0, m9, m0 4247 mova [tmpq], ym0 4248 add tmpq, 32 
4249 sub hd, 4 4250 jg .h_w4_loop 4251 RET 4252.h_w8: 4253 mova m6, [spel_h_shufA] 4254 movu m7, [spel_h_shufB] 4255 movu m8, [spel_h_shufC] 4256 mova m9, [spel_h_shufD] 4257 mova m11, [prep_endB] 4258.h_w8_loop: 4259 movu ym4, [srcq+strideq*0] 4260 vinserti32x8 m4, [srcq+strideq*1], 1 4261 movu ym5, [srcq+strideq*2] 4262 vinserti32x8 m5, [srcq+r6 ], 1 4263 lea srcq, [srcq+strideq*4] 4264 mova m0, m10 4265 mova m1, m10 4266 vpermb m2, m6, m4 4267 vpermb m3, m6, m5 4268 vpdpwssd m0, m12, m2 4269 vpdpwssd m1, m12, m3 4270 vpermb m2, m7, m4 4271 vpermb m3, m7, m5 4272 vpdpwssd m0, m13, m2 4273 vpdpwssd m1, m13, m3 4274 vpermb m2, m8, m4 4275 vpermb m3, m8, m5 4276 vpdpwssd m0, m14, m2 4277 vpdpwssd m1, m14, m3 4278 vpermb m2, m9, m4 4279 vpermb m3, m9, m5 4280 vpdpwssd m0, m15, m2 4281 vpdpwssd m1, m15, m3 4282 vpermt2b m0, m11, m1 4283 mova [tmpq], m0 4284 add tmpq, 64 4285 sub hd, 4 4286 jg .h_w8_loop 4287 RET 4288.h: 4289 vpbroadcastd m10, [prep_8tap_rnd] 4290 test myd, 0xf00 4291 jnz .hv 4292 lea r6, [strideq*3] 4293 cmp wd, 4 4294 je .h_w4 4295 shr mxd, 16 4296 pmovsxbw xmm0, [base+subpel_filters+mxq*8] 4297 mov r5d, r7m 4298 sub srcq, 6 4299 shr r5d, 11 4300 psllw xmm0, [base+prep_hv_shift+r5*8] 4301 mova [tmpq], xmm0 4302 vpbroadcastd m12, xmm0 4303 vpbroadcastd m13, [tmpq+ 4] 4304 vpbroadcastd m14, [tmpq+ 8] 4305 vpbroadcastd m15, [tmpq+12] 4306 cmp wd, 16 4307 jl .h_w8 4308 vbroadcasti32x4 m6, [spel_h_shufA] 4309 vbroadcasti32x4 m7, [spel_h_shufB] 4310 mova m11, [prep_endC] 4311 jg .h_w32 4312.h_w16_loop: 4313 movu ym2, [srcq+strideq*0+ 0] 4314 vinserti32x8 m2, [srcq+strideq*1+ 0], 1 4315 movu ym3, [srcq+strideq*0+16] 4316 vinserti32x8 m3, [srcq+strideq*1+16], 1 4317 lea srcq, [srcq+strideq*2] 4318 mova m0, m10 4319 mova m1, m10 4320 pshufb m4, m2, m6 4321 vpdpwssd m0, m12, m4 ; a0 4322 pshufb m4, m3, m6 4323 vpdpwssd m1, m14, m4 ; b2 4324 pshufb m4, m2, m7 4325 vpdpwssd m0, m13, m4 ; a1 4326 pshufb m4, m3, m7 4327 vpdpwssd m1, m15, m4 ; b3 4328 shufpd m2, 
m3, 0x55 4329 pshufb m4, m2, m6 4330 vpdpwssd m0, m14, m4 ; a2 4331 vpdpwssd m1, m12, m4 ; b0 4332 pshufb m2, m7 4333 vpdpwssd m0, m15, m2 ; a3 4334 vpdpwssd m1, m13, m2 ; b1 4335 vpermt2b m0, m11, m1 4336 mova [tmpq], m0 4337 add tmpq, 64 4338 sub hd, 2 4339 jg .h_w16_loop 4340 RET 4341.h_w32: 4342 lea srcq, [srcq+wq*2] 4343 neg wq 4344.h_w32_loop0: 4345 mov r6, wq 4346.h_w32_loop: 4347 movu m2, [srcq+r6*2+ 0] 4348 movu m3, [srcq+r6*2+ 8] 4349 mova m0, m10 4350 mova m1, m10 4351 pshufb m4, m2, m6 4352 vpdpwssd m0, m12, m4 ; a0 4353 pshufb m4, m3, m6 4354 vpdpwssd m1, m12, m4 ; b0 4355 vpdpwssd m0, m14, m4 ; a2 4356 movu m4, [srcq+r6*2+16] 4357 pshufb m3, m7 4358 vpdpwssd m1, m13, m3 ; b1 4359 vpdpwssd m0, m15, m3 ; a3 4360 pshufb m3, m4, m6 4361 vpdpwssd m1, m14, m3 ; b2 4362 pshufb m2, m7 4363 vpdpwssd m0, m13, m2 ; a1 4364 pshufb m4, m7 4365 vpdpwssd m1, m15, m4 ; b3 4366 vpermt2b m0, m11, m1 4367 mova [tmpq], m0 4368 add tmpq, 64 4369 add r6, 32 4370 jl .h_w32_loop 4371 add srcq, strideq 4372 dec hd 4373 jg .h_w32_loop0 4374 RET 4375.hv: 4376 vpbroadcastd m11, [pd_128] 4377 cmp wd, 4 4378 jg .hv_w8 4379 movzx mxd, mxb 4380 pmovsxbw xmm0, [base+subpel_filters+mxq*8] 4381 movzx mxd, myb 4382 shr myd, 16 4383 cmp hd, 4 4384 cmove myd, mxd 4385 mov r5d, r7m 4386 pmovsxbw xmm1, [base+subpel_filters+myq*8] 4387 lea r6, [strideq*3] 4388 sub srcq, 2 4389 shr r5d, 11 4390 sub srcq, r6 4391 psllw xmm0, [base+prep_hv_shift+r5*8] 4392 psllw xmm1, 2 4393 mova [tmpq+ 0], xmm0 4394 mova [tmpq+16], xmm1 4395 vpbroadcastd m12, xmm1 4396 movu xm16, [srcq+strideq*0] 4397 mov r3d, 0xff0 4398 vinserti128 ym16, [srcq+strideq*1], 1 4399 kmovw k1, r3d 4400 vbroadcasti32x4 m18, [srcq+strideq*2] 4401 add srcq, r6 4402 vinserti64x2 m16{k1}, m18, [srcq+strideq*0], 3 4403 movu xm17, [srcq+strideq*1] 4404 vbroadcasti32x4 ym18, [srcq+strideq*2] 4405 add srcq, r6 4406 vinserti32x4 m17{k1}, m18, [srcq+strideq*0], 2 4407 vbroadcasti32x4 m5, [spel_h_shufA] 4408 vbroadcasti32x4 m6, [spel_h_shufB] 
4409 vpbroadcastd m8, [tmpq+ 4] 4410 vpbroadcastd m9, [tmpq+ 8] 4411 mova m1, m10 4412 mova m19, [spel_shuf4a] 4413 mova m2, m10 4414 pshufb m0, m16, m5 4415 vpdpwssd m1, m8, m0 4416 pshufb m0, m17, m5 4417 vpdpwssd m2, m8, m0 4418 vpbroadcastd m13, [tmpq+20] 4419 pshufb m16, m6 4420 vpbroadcastd m14, [tmpq+24] 4421 pshufb m17, m6 4422 vpbroadcastd m15, [tmpq+28] 4423 vpdpwssd m1, m9, m16 ; 0 1 2 3 4424 vpdpwssd m2, m9, m17 ; 4 5 6 4425 mova m7, [spel_shuf4b] 4426 vpermt2b m1, m19, m2 ; 01 12 23 34 4427 vpermb m2, m19, m2 ; 45 56 4428 mova ym19, [prep_endA] 4429 vshufi32x4 m2, m1, m2, q1032 ; 23 34 45 56 4430.hv_w4_loop: 4431 movu xm17, [srcq+strideq*1] 4432 vinserti128 ym17, [srcq+strideq*2], 1 4433 vbroadcasti32x4 m16, [srcq+r6 ] 4434 lea srcq, [srcq+strideq*4] 4435 vinserti64x2 m17{k1}, m16, [srcq+strideq*0], 3 4436 mova m18, m10 4437 pshufb m16, m17, m5 4438 vpdpwssd m18, m8, m16 4439 mova m16, m11 4440 vpdpwssd m16, m12, m1 ; a0 b0 c0 d0 4441 pshufb m17, m6 4442 vpdpwssd m18, m9, m17 ; 7 8 9 a 4443 mova m1, m2 4444 vpdpwssd m16, m13, m2 ; a1 b1 c1 d1 4445 vpermt2b m2, m7, m18 ; 67 78 89 9a 4446 vpdpwssd m16, m15, m2 ; a3 b3 c3 d3 4447 vshufi32x4 m1, m2, q1032 ; 45 56 67 78 4448 vpdpwssd m16, m14, m1 ; a2 b2 c2 d2 4449 vpermb m16, m19, m16 4450 mova [tmpq], ym16 4451 add tmpq, 32 4452 sub hd, 4 4453 jg .hv_w4_loop 4454 vzeroupper 4455 RET 4456.hv_w8: 4457 shr mxd, 16 4458 pmovsxbw xmm0, [base+subpel_filters+mxq*8] 4459 movzx mxd, myb 4460 shr myd, 16 4461 cmp hd, 6 4462 cmovs myd, mxd 4463 mov r5d, r7m 4464 pmovsxbw xmm1, [base+subpel_filters+myq*8] 4465 lea r6, [strideq*3] 4466 sub srcq, 6 4467 shr r5d, 11 4468 sub srcq, r6 4469 psllw xmm0, [base+prep_hv_shift+r5*8] 4470 psllw xmm1, 2 4471 mova [tmpq+ 0], xmm0 4472 mova [tmpq+16], xmm1 4473 vpbroadcastd m12, xmm0 4474 vpbroadcastd m13, [tmpq+ 4] 4475 vpbroadcastd m14, [tmpq+ 8] 4476 vpbroadcastd m15, [tmpq+12] 4477 vpbroadcastd m16, xmm1 4478 vpbroadcastd m17, [tmpq+20] 4479 vpbroadcastd m18, [tmpq+24] 4480 
vpbroadcastd m19, [tmpq+28] 4481 cmp wd, 8 4482 jg .hv_w16 4483 WIN64_SPILL_XMM 23 4484 mova m5, [spel_h_shufA] 4485 movu ym0, [srcq+strideq*0] 4486 vinserti32x8 m0, [srcq+strideq*1], 1 ; 0 1 4487 movu ym9, [srcq+strideq*2] 4488 add srcq, r6 4489 vinserti32x8 m9, [srcq+strideq*0], 1 ; 2 3 4490 movu ym20, [srcq+strideq*1] 4491 vinserti32x8 m20, [srcq+strideq*2], 1 ; 4 5 4492 add srcq, r6 4493 movu ym21, [srcq+strideq*0] ; 6 4494 movu m6, [spel_h_shufB] 4495 movu m7, [spel_h_shufC] 4496 mova ym22, [prep_endB] 4497 vpermb m8, m5, m0 4498 mova m1, m10 4499 vpdpwssd m1, m12, m8 ; a0 b0 4500 vpermb m8, m5, m9 4501 mova m2, m10 4502 vpdpwssd m2, m12, m8 ; c0 d0 4503 vpermb m8, m5, m20 4504 mova m3, m10 4505 vpdpwssd m3, m12, m8 ; e0 f0 4506 vpermb m8, m5, m21 4507 mova m4, m10 4508 vpdpwssd m4, m12, m8 ; g0 4509 vpermb m8, m6, m0 4510 vpdpwssd m1, m13, m8 ; a1 b1 4511 vpermb m8, m6, m9 4512 vpdpwssd m2, m13, m8 ; c1 d1 4513 vpermb m8, m6, m20 4514 vpdpwssd m3, m13, m8 ; e1 f1 4515 vpermb m8, m6, m21 4516 vpdpwssd m4, m13, m8 ; g1 4517 vpermb m8, m7, m0 4518 vpdpwssd m1, m14, m8 ; a2 b2 4519 vpermb m8, m7, m9 4520 vpdpwssd m2, m14, m8 ; c2 d2 4521 vpermb m8, m7, m20 4522 vpdpwssd m3, m14, m8 ; e2 f2 4523 vpermb m8, m7, m21 4524 vpdpwssd m4, m14, m8 ; g2 4525 mova m8, [spel_h_shufD] 4526 vpermb m0, m8, m0 4527 vpdpwssd m1, m15, m0 ; a3 b3 4528 mova m0, [spel_shuf8a] 4529 vpermb m9, m8, m9 4530 vpdpwssd m2, m15, m9 ; c3 d3 4531 mova m9, [spel_shuf8b] 4532 vpermb m20, m8, m20 4533 vpdpwssd m3, m15, m20 ; e3 f3 4534 vpermb m21, m8, m21 4535 vpdpwssd m4, m15, m21 ; g3 4536 vpermt2b m1, m0, m2 ; 01 12 4537 vpermt2b m2, m0, m3 ; 23 34 4538 vpermt2b m3, m0, m4 ; 45 56 4539.hv_w8_loop: 4540 movu ym0, [srcq+strideq*1] 4541 lea srcq, [srcq+strideq*2] 4542 vinserti32x8 m0, [srcq+strideq*0], 1 4543 mova m4, m10 4544 mova m20, m11 4545 vpermb m21, m5, m0 4546 vpdpwssd m4, m12, m21 ; h0 i0 4547 vpermb m21, m6, m0 4548 vpdpwssd m20, m16, m1 ; A0 B0 4549 vpdpwssd m4, m13, m21 ; h1 i1 4550 
vpermb m21, m7, m0 4551 mova m1, m2 4552 vpdpwssd m20, m17, m2 ; A1 B1 4553 vpdpwssd m4, m14, m21 ; h2 i2 4554 vpermb m21, m8, m0 4555 mova m2, m3 4556 vpdpwssd m20, m18, m3 ; A2 B2 4557 vpdpwssd m4, m15, m21 ; h3 i3 4558 vpermt2b m3, m9, m4 ; 67 78 4559 vpdpwssd m20, m19, m3 ; A3 B3 4560 vpermb m20, m22, m20 4561 mova [tmpq], ym20 4562 add tmpq, 32 4563 sub hd, 2 4564 jg .hv_w8_loop 4565 RET 4566.hv_w16: 4567 WIN64_SPILL_XMM 27 4568%if WIN64 4569 push r8 4570%endif 4571 vbroadcasti32x4 m20, [spel_h_shufA] 4572 vbroadcasti32x4 m21, [spel_h_shufB] 4573 add wd, wd 4574 mova m9, [spel_shuf16] 4575 mova m26, [prep_endB] 4576 lea r5d, [hq+wq*8-256] 4577.hv_w16_loop0: 4578 vbroadcasti32x8 m5, [srcq+strideq*0+ 8] 4579 vinserti32x8 m4, m5, [srcq+strideq*0+ 0], 0 4580 vinserti32x8 m5, [srcq+strideq*0+16], 1 ; 0 4581 movu ym6, [srcq+strideq*1+ 0] 4582 movu ym7, [srcq+strideq*1+16] 4583 lea r7, [srcq+r6] 4584 vinserti32x8 m6, [srcq+strideq*2+ 0], 1 4585 vinserti32x8 m7, [srcq+strideq*2+16], 1 ; 1 2 4586 movu ym22, [r7 +strideq*0+ 0] 4587 movu ym23, [r7 +strideq*0+16] 4588 mov r8, tmpq 4589 vinserti32x8 m22, [r7 +strideq*1+ 0], 1 4590 vinserti32x8 m23, [r7 +strideq*1+16], 1 ; 3 4 4591 movu ym24, [r7 +strideq*2+ 0] 4592 movu ym25, [r7 +strideq*2+16] 4593 add r7, r6 4594 vinserti32x8 m24, [r7 +strideq*0+ 0], 1 4595 vinserti32x8 m25, [r7 +strideq*0+16], 1 ; 5 6 4596 pshufb m0, m4, m20 4597 mova m1, m10 4598 vpdpwssd m1, m12, m0 ; a0 4599 pshufb m0, m6, m20 4600 mova m2, m10 4601 vpdpwssd m2, m12, m0 ; b0 4602 pshufb m0, m7, m20 4603 mova m3, m10 4604 vpdpwssd m3, m14, m0 ; c2 4605 pshufb m0, m4, m21 4606 vpdpwssd m1, m13, m0 ; a1 4607 pshufb m0, m6, m21 4608 vpdpwssd m2, m13, m0 ; b1 4609 pshufb m0, m7, m21 4610 vpdpwssd m3, m15, m0 ; c3 4611 pshufb m0, m5, m20 4612 vpdpwssd m1, m14, m0 ; a2 4613 shufpd m6, m7, 0x55 4614 pshufb m7, m6, m20 4615 vpdpwssd m2, m14, m7 ; b2 4616 vpdpwssd m3, m12, m7 ; c0 4617 pshufb m5, m21 4618 vpdpwssd m1, m15, m5 ; a3 4619 pshufb m6, m21 4620 
vpdpwssd m2, m15, m6 ; b3 4621 vpdpwssd m3, m13, m6 ; c1 4622 pshufb m0, m22, m20 4623 mova m4, m10 4624 vpdpwssd m4, m12, m0 ; d0 4625 pshufb m0, m23, m20 4626 mova m5, m10 4627 vpdpwssd m5, m14, m0 ; e2 4628 pshufb m0, m24, m20 4629 mova m6, m10 4630 vpdpwssd m6, m12, m0 ; f0 4631 pshufb m0, m25, m20 4632 mova m7, m10 4633 vpdpwssd m7, m14, m0 ; g2 4634 pshufb m0, m22, m21 4635 vpdpwssd m4, m13, m0 ; d1 4636 pshufb m0, m23, m21 4637 vpdpwssd m5, m15, m0 ; e3 4638 pshufb m0, m24, m21 4639 vpdpwssd m6, m13, m0 ; f1 4640 pshufb m0, m25, m21 4641 vpdpwssd m7, m15, m0 ; g3 4642 shufpd m22, m23, 0x55 4643 pshufb m23, m22, m20 4644 vpdpwssd m4, m14, m23 ; d2 4645 vpdpwssd m5, m12, m23 ; e0 4646 shufpd m24, m25, 0x55 4647 pshufb m25, m24, m20 4648 vpdpwssd m6, m14, m25 ; f2 4649 vpdpwssd m7, m12, m25 ; g0 4650 pshufb m22, m21 4651 vpdpwssd m4, m15, m22 ; d3 4652 vpdpwssd m5, m13, m22 ; e1 4653 pshufb m24, m21 4654 vpdpwssd m6, m15, m24 ; f3 4655 vpdpwssd m7, m13, m24 ; g1 4656 pslldq m1, 1 4657 vpermt2b m2, m9, m3 ; 12 4658 vpermt2b m4, m9, m5 ; 34 4659 vpermt2b m6, m9, m7 ; 56 4660 vpshrdd m1, m2, 16 ; 01 4661 vpshrdd m3, m2, m4, 16 ; 23 4662 vpshrdd m5, m4, m6, 16 ; 45 4663.hv_w16_loop: 4664 movu ym24, [r7+strideq*1+ 0] 4665 movu ym25, [r7+strideq*1+16] 4666 lea r7, [r7+strideq*2] 4667 vinserti32x8 m24, [r7+strideq*0+ 0], 1 4668 vinserti32x8 m25, [r7+strideq*0+16], 1 4669 mova m7, m10 4670 mova m8, m10 4671 pshufb m0, m24, m20 4672 vpdpwssd m7, m12, m0 ; h0 4673 mova m22, m11 4674 pshufb m0, m25, m20 4675 vpdpwssd m8, m14, m0 ; i2 4676 mova m23, m11 4677 vpdpwssd m22, m16, m1 ; A0 4678 mova m1, m3 4679 vpdpwssd m23, m16, m2 ; B0 4680 mova m2, m4 4681 pshufb m0, m24, m21 4682 vpdpwssd m7, m13, m0 ; h1 4683 pshufb m0, m25, m21 4684 vpdpwssd m8, m15, m0 ; i3 4685 vpdpwssd m22, m17, m3 ; A1 4686 mova m3, m5 4687 vpdpwssd m23, m17, m4 ; B1 4688 mova m4, m6 4689 shufpd m24, m25, 0x55 4690 pshufb m25, m24, m20 4691 vpdpwssd m7, m14, m25 ; h2 4692 vpdpwssd m8, m12, m25 ; i0 
4693 vpdpwssd m22, m18, m5 ; A2 4694 vpdpwssd m23, m18, m6 ; B2 4695 pshufb m24, m21 4696 vpdpwssd m7, m15, m24 ; h3 4697 vpdpwssd m8, m13, m24 ; i1 4698 vpermt2b m7, m9, m8 ; 78 4699 vpshrdd m5, m6, m7, 16 ; 67 4700 vpdpwssd m22, m19, m5 ; A3 4701 vpdpwssd m23, m19, m7 ; B3 4702 mova m6, m7 4703 vpermt2b m22, m26, m23 4704 mova [r8+wq*0], ym22 4705 vextracti32x8 [r8+wq*1], m22, 1 4706 lea r8, [r8+wq*2] 4707 sub hd, 2 4708 jg .hv_w16_loop 4709 add srcq, 32 4710 add tmpq, 32 4711 movzx hd, r5b 4712 sub r5d, 1<<8 4713 jg .hv_w16_loop0 4714%if WIN64 4715 pop r8 4716%endif 4717 RET 4718 4719%if WIN64 4720DECLARE_REG_TMP 5 4721%else 4722DECLARE_REG_TMP 7 4723%endif 4724 4725cglobal warp_affine_8x8t_16bpc, 4, 7, 22, tmp, ts 4726%define base r6-pd_0to7 4727 mov t0d, r7m 4728 lea r6, [pd_0to7] 4729 shr t0d, 11 4730 vpbroadcastd m8, [base+warp_8x8t_rnd_v] 4731 vpbroadcastd m1, [base+warp_8x8_rnd_h+t0*4] 4732 call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main 4733 psrad m14, m16, 15 4734 call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2 4735 psrad m16, 15 4736 packssdw m14, m16 4737 call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2 4738 psrad m15, m16, 15 4739 call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2 4740 add tsq, tsq 4741 psrad m16, 15 4742 packssdw m15, m16 4743 jmp mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).end 4744 4745cglobal warp_affine_8x8_16bpc, 4, 7, 22, dst, ds, src, ss, abcd 4746 mov t0d, r7m ; pixel_max 4747 lea r6, [pd_0to7] 4748 shr t0d, 11 4749 vpbroadcastd m1, [base+warp_8x8_rnd_h+t0*4] 4750 vpbroadcastd m8, [base+warp_8x8_rnd_v+t0*4] 4751 call .main 4752 psrad m14, m16, 13 4753 call .main2 4754 psrad m16, 13 4755 packusdw m14, m16 4756 call .main2 4757 psrad m15, m16, 13 4758 call .main2 4759 vpbroadcastd m0, [base+bidir_shift+t0*4] 4760 vpsrlvw m14, m0 4761 psrad m16, 13 4762 packusdw m15, m16 4763 vpsrlvw m15, m0 4764.end: 4765 mova m0, [base+warp8x8_end] 
4766 vpermb m16, m0, m14 4767 lea r2, [dsq*3] 4768 mova [dstq+dsq*0], xm16 4769 vextracti128 [dstq+dsq*1], ym16, 1 4770 vextracti32x4 [dstq+dsq*2], m16, 2 4771 vextracti32x4 [dstq+r2 ], m16, 3 4772 vpermb m16, m0, m15 4773 lea dstq, [dstq+dsq*4] 4774 mova [dstq+dsq*0], xm16 4775 vextracti128 [dstq+dsq*1], ym16, 1 4776 vextracti32x4 [dstq+dsq*2], m16, 2 4777 vextracti32x4 [dstq+r2 ], m16, 3 4778 RET 4779.main: 4780 vpbroadcastd ym3, [base+pd_512] 4781%if WIN64 4782 mov abcdq, r5mp 4783 vpaddd ym18, ym3, r6m {1to8} ; mx 4784%else 4785 add r5d, 512 4786 vpbroadcastd ym18, r5d 4787%endif 4788 vpaddd ym20, ym3, r7m {1to8} ; my 4789 mova ym16, [base+pd_0to7] 4790 vpbroadcastd ym19, [abcdq+4*0] ; alpha 4791 vpbroadcastd ym21, [abcdq+4*1] ; gamma 4792 lea r4, [ssq*3+6] 4793 vpdpwssd ym18, ym19, ym16 ; tmx 4794 vpdpwssd ym20, ym21, ym16 ; tmy 4795 sub srcq, r4 4796 mova m10, [base+warp8x8_permA] 4797 lea r4, [mc_warp_filter+64*8] 4798 vbroadcasti32x4 m12, [base+warp8x8_permC] 4799 kxnorb k1, k1, k1 4800 vbroadcasti32x4 m13, [base+warp8x8_permD] 4801 movu ym5, [srcq+0] 4802 vinserti32x8 m5, [srcq+8], 1 4803 psrad ym17, ym18, 10 4804 mova m11, [base+warp8x8_permB] 4805 kmovb k2, k1 4806 vpgatherdq m3{k1}, [r4+ym17*8] ; filter_x0 4807 psrad ym19, 16 ; beta 4808 psrad ym21, 16 ; delta 4809 paddd ym18, ym19 4810 vpermb m4, m10, m5 4811 vpbroadcastq m9, [base+warp_shift_h+t0*8] 4812 pshufd m3, m3, q3120 4813 paddd m7, m1, m1 4814 pshufb m2, m3, m12 4815 vpdpwssd m1, m4, m2 4816 vpermb m5, m11, m5 4817 vshufi32x4 m4, m5, q1021 4818 pshufb m3, m13 4819 vpdpwssd m1, m4, m3 4820 call .h 4821 psllq m2, m1, 32 4822 paddd m1, m2 4823 vpmultishiftqb m1, m9, m1 4824 vpshrdq m1, m0, 48 ; 01 12 4825 call .h 4826 vpshrdq m2, m1, m0, 48 ; 23 34 4827 call .h 4828 vpshrdq m3, m2, m0, 48 ; 45 56 4829.main2: 4830 call .h 4831 psrad ym6, ym20, 10 4832 kmovb k1, k2 4833 paddd ym17, ym20, ym21 ; my += delta 4834 vpgatherdq m20{k2}, [r4+ym6*8] ; filter_y0 4835 psrad ym16, ym17, 10 4836 kmovb k2, k1 
4837 vpgatherdq m6{k1}, [r4+ym16*8] ; filter_y1 4838 shufps m5, m20, m6, q2020 4839 mova m16, m8 4840 pshufb m4, m5, m12 4841 vpdpwssd m16, m1, m4 ; a0 b0 4842 pshufb m5, m13 4843 mova m1, m2 4844 vpdpwssd m16, m2, m5 ; a1 b1 4845 shufps m6, m20, m6, q3131 4846 paddd ym20, ym17, ym21 4847 pshufb m4, m6, m12 4848 mova m2, m3 4849 vpdpwssd m16, m3, m4 ; a2 b2 4850 vpshrdq m3, m0, 48 ; 67 78 4851 pshufb m6, m13 4852 vpdpwssd m16, m3, m6 ; a3 b3 4853 ret 4854ALIGN function_align 4855.h: 4856 movu ym16, [srcq+ssq*1] 4857 psrad ym6, ym18, 10 4858 lea srcq, [srcq+ssq*2] 4859 vinserti32x8 m5, m16, [srcq+ssq*0], 1 4860 kmovb k1, k2 4861 paddd ym17, ym18, ym19 ; mx += beta 4862 vpgatherdq m18{k2}, [r4+ym6*8] ; filter_x1 4863 psrad ym16, ym17, 10 4864 kmovb k2, k1 4865 vpgatherdq m6{k1}, [r4+ym16*8] ; filter_x2 4866 vpermb m4, m10, m5 4867 shufps m16, m18, m6, q2020 4868 shufps m6, m18, m6, q3131 4869 mova m0, m7 4870 pshufb m18, m16, m12 4871 vpdpwssd m0, m4, m18 ; a0 b0 4872 vpermb m5, m11, m5 4873 pshufb m18, m6, m13 4874 vpdpwssd m0, m5, m18 ; a3 b3 4875 paddd ym18, ym17, ym19 4876 vshufi32x4 m17, m4, m5, q1021 4877 pshufb m16, m13 4878 vpdpwssd m0, m17, m16 ; a1 b1 4879 vshufi32x4 m4, m5, q2132 4880 pshufb m6, m12 4881 vpdpwssd m0, m4, m6 ; a2 b2 4882 vpmultishiftqb m0, m9, m0 ; a a b b 4883 ret 4884 4885%macro BIDIR_FN 0 4886 call .main 4887 lea stride3q, [strideq*3] 4888 jmp wq 4889.w4: 4890 movq [dstq ], xm0 4891 movhps [dstq+strideq*1], xm0 4892 vextracti32x4 xm2, ym0, 1 4893 movq [dstq+strideq*2], xm2 4894 movhps [dstq+stride3q ], xm2 4895 cmp hd, 8 4896 jl .w4_end 4897 vextracti32x4 xm2, m0, 2 4898 lea dstq, [dstq+strideq*4] 4899 movq [dstq ], xm2 4900 movhps [dstq+strideq*1], xm2 4901 vextracti32x4 xm0, m0, 3 4902 movq [dstq+strideq*2], xm0 4903 movhps [dstq+stride3q ], xm0 4904 je .w4_end 4905 lea dstq, [dstq+strideq*4] 4906 movq [dstq ], xm1 4907 movhps [dstq+strideq*1], xm1 4908 vextracti32x4 xm0, ym1, 1 4909 movq [dstq+strideq*2], xm0 4910 movhps 
[dstq+stride3q ], xm0 4911 vextracti32x4 xm0, m1, 2 4912 lea dstq, [dstq+strideq*4] 4913 movq [dstq ], xm0 4914 movhps [dstq+strideq*1], xm0 4915 vextracti32x4 xm1, m1, 3 4916 movq [dstq+strideq*2], xm1 4917 movhps [dstq+stride3q ], xm1 4918.w4_end: 4919 RET 4920.w8_loop: 4921 call .main 4922 lea dstq, [dstq+strideq*4] 4923.w8: 4924 mova [dstq+strideq*0], xm0 4925 vextracti32x4 [dstq+strideq*1], ym0, 1 4926 vextracti32x4 [dstq+strideq*2], m0, 2 4927 vextracti32x4 [dstq+stride3q ], m0, 3 4928 sub hd, 8 4929 jl .w8_end 4930 lea dstq, [dstq+strideq*4] 4931 mova [dstq+strideq*0], xm1 4932 vextracti32x4 [dstq+strideq*1], ym1, 1 4933 vextracti32x4 [dstq+strideq*2], m1, 2 4934 vextracti32x4 [dstq+stride3q ], m1, 3 4935 jg .w8_loop 4936.w8_end: 4937 RET 4938.w16_loop: 4939 call .main 4940 lea dstq, [dstq+strideq*4] 4941.w16: 4942 mova [dstq+strideq*0], ym0 4943 vextracti32x8 [dstq+strideq*1], m0, 1 4944 mova [dstq+strideq*2], ym1 4945 vextracti32x8 [dstq+stride3q ], m1, 1 4946 sub hd, 4 4947 jg .w16_loop 4948 RET 4949.w32_loop: 4950 call .main 4951 lea dstq, [dstq+strideq*2] 4952.w32: 4953 mova [dstq+strideq*0], m0 4954 mova [dstq+strideq*1], m1 4955 sub hd, 2 4956 jg .w32_loop 4957 RET 4958.w64_loop: 4959 call .main 4960 add dstq, strideq 4961.w64: 4962 mova [dstq+64*0], m0 4963 mova [dstq+64*1], m1 4964 dec hd 4965 jg .w64_loop 4966 RET 4967.w128_loop: 4968 call .main 4969 add dstq, strideq 4970.w128: 4971 mova [dstq+64*0], m0 4972 mova [dstq+64*1], m1 4973 call .main 4974 mova [dstq+64*2], m0 4975 mova [dstq+64*3], m1 4976 dec hd 4977 jg .w128_loop 4978 RET 4979%endmacro 4980 4981%if WIN64 4982DECLARE_REG_TMP 5 4983%else 4984DECLARE_REG_TMP 7 4985%endif 4986 4987cglobal avg_16bpc, 4, 7, 4, dst, stride, tmp1, tmp2, w, h, stride3 4988%define base r6-avg_avx512icl_table 4989 lea r6, [avg_avx512icl_table] 4990 tzcnt wd, wm 4991 mov t0d, r6m ; pixel_max 4992 movsxd wq, [r6+wq*4] 4993 shr t0d, 11 4994 vpbroadcastd m2, [base+avg_round+t0*4] 4995 vpbroadcastd m3, 
[base+avg_shift+t0*4] 4996 movifnidn hd, hm 4997 add wq, r6 4998 BIDIR_FN 4999ALIGN function_align 5000.main: 5001 mova m0, [tmp1q+64*0] 5002 paddsw m0, [tmp2q+64*0] 5003 mova m1, [tmp1q+64*1] 5004 paddsw m1, [tmp2q+64*1] 5005 add tmp1q, 64*2 5006 add tmp2q, 64*2 5007 pmaxsw m0, m2 5008 pmaxsw m1, m2 5009 psubsw m0, m2 5010 psubsw m1, m2 5011 vpsrlvw m0, m3 5012 vpsrlvw m1, m3 5013 ret 5014 5015cglobal w_avg_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, h, stride3 5016%define base r6-w_avg_avx512icl_table 5017 lea r6, [w_avg_avx512icl_table] 5018 tzcnt wd, wm 5019 mov t0d, r7m ; pixel_max 5020 shr t0d, 11 5021 movsxd wq, [r6+wq*4] 5022 vpbroadcastd m5, [base+w_avg_round+t0*4] 5023 vpbroadcastd m7, [base+bidir_shift+t0*4] 5024 add wq, r6 5025 mov r6d, r6m ; weight 5026 lea t0d, [r6-16] 5027 shl r6d, 16 5028 sub r6d, t0d ; 16-weight, weight 5029 movifnidn hd, hm 5030 vpbroadcastd m6, r6d 5031 BIDIR_FN 5032ALIGN function_align 5033.main: 5034 mova m3, [tmp1q+64*0] 5035 mova m1, [tmp2q+64*0] 5036 mova m0, [tmp1q+64*1] 5037 mova m4, [tmp2q+64*1] 5038 add tmp1q, 64*2 5039 add tmp2q, 64*2 5040 punpcklwd m2, m1, m3 5041 punpckhwd m1, m3 5042 punpcklwd m3, m4, m0 5043 punpckhwd m4, m0 5044 mova m0, m5 5045 vpdpwssd m0, m6, m2 5046 mova m2, m5 5047 vpdpwssd m2, m6, m1 5048 mova m1, m5 5049 vpdpwssd m1, m6, m3 5050 mova m3, m5 5051 vpdpwssd m3, m6, m4 5052 REPX {psrad x, 2}, m0, m2, m1, m3 5053 packusdw m0, m2 5054 packusdw m1, m3 5055 vpsrlvw m0, m7 5056 vpsrlvw m1, m7 5057 ret 5058 5059cglobal mask_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3 5060%define base r7-mask_avx512icl_table 5061 lea r7, [mask_avx512icl_table] 5062 tzcnt wd, wm 5063 mov r6d, r7m ; pixel_max 5064 movifnidn hd, hm 5065 shr r6d, 11 5066 movsxd wq, [r7+wq*4] 5067 vpbroadcastd m8, [base+pw_64] 5068 vpbroadcastd m9, [base+mask_round+r6*4] 5069 vpbroadcastd m10, [base+bidir_shift+r6*4] 5070 mov maskq, maskmp 5071 add wq, r7 5072 BIDIR_FN 5073ALIGN function_align 5074.main: 5075 pmovzxbw m1, 
[maskq+32*0] 5076 mova m4, [tmp1q+64*0] 5077 mova m2, [tmp2q+64*0] 5078 pmovzxbw m6, [maskq+32*1] 5079 mova m5, [tmp1q+64*1] 5080 mova m3, [tmp2q+64*1] 5081 add maskq, 32*2 5082 add tmp1q, 64*2 5083 add tmp2q, 64*2 5084 punpcklwd m7, m4, m2 5085 punpckhwd m4, m2 5086 psubw m0, m8, m1 5087 punpcklwd m2, m1, m0 ; m, 64-m 5088 punpckhwd m1, m0 5089 mova m0, m9 5090 vpdpwssd m0, m7, m2 5091 mova m2, m9 5092 vpdpwssd m2, m4, m1 ; tmp1 * m + tmp2 * (64-m) 5093 punpcklwd m7, m5, m3 5094 punpckhwd m5, m3 5095 psubw m1, m8, m6 5096 punpcklwd m3, m6, m1 5097 punpckhwd m6, m1 5098 mova m1, m9 5099 vpdpwssd m1, m7, m3 5100 mova m3, m9 5101 vpdpwssd m3, m5, m6 5102 REPX {psrad x, 4}, m0, m2, m1, m3 5103 packusdw m0, m2 5104 packusdw m1, m3 5105 vpsrlvw m0, m10 5106 vpsrlvw m1, m10 5107 ret 5108 5109cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3 5110%define base r7-w_mask_420_avx512icl_table 5111 lea r7, [w_mask_420_avx512icl_table] 5112 tzcnt wd, wm 5113 mov r6d, r8m ; pixel_max 5114 movifnidn hd, hm 5115 shr r6d, 11 5116 movsxd wq, [r7+wq*4] 5117 vpbroadcastd m10, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32 5118 vpbroadcastd m11, [base+pw_64] 5119 vpbroadcastd m12, [base+mask_round+r6*4] 5120 vpbroadcastd m13, [base+bidir_shift+r6*4] 5121 mov r6d, r7m ; sign 5122 vpbroadcastd m14, [base+w_mask_round+r6*4] 5123 mova ym15, [w_mask_end42x] 5124 mov maskq, maskmp 5125 add wq, r7 5126 call .main 5127 lea stride3q, [strideq*3] 5128 jmp wq 5129.w4: 5130 mova m4, [w_mask_shuf4] 5131 vpermt2b m2, m4, m3 5132 mova m3, m14 5133 vpdpbusd m3, m2, [pb_64] {1to16} 5134 vpermb m3, m15, m3 5135 movq [dstq+strideq*0], xm0 5136 movhps [dstq+strideq*1], xm0 5137 vextracti32x4 xm2, ym0, 1 5138 movq [dstq+strideq*2], xm2 5139 movhps [dstq+stride3q ], xm2 5140 mova [maskq], xm3 5141 cmp hd, 8 5142 jl .w4_end 5143 vextracti32x4 xm2, m0, 2 5144 lea dstq, [dstq+strideq*4] 5145 movq [dstq+strideq*0], xm2 5146 movhps [dstq+strideq*1], xm2 5147 vextracti32x4 xm0, m0, 
3 5148 movq [dstq+strideq*2], xm0 5149 movhps [dstq+stride3q ], xm0 5150 je .w4_end 5151 lea dstq, [dstq+strideq*4] 5152 movq [dstq+strideq*0], xm1 5153 movhps [dstq+strideq*1], xm1 5154 vextracti32x4 xm2, ym1, 1 5155 movq [dstq+strideq*2], xm2 5156 movhps [dstq+stride3q ], xm2 5157 vextracti32x4 xm2, m1, 2 5158 lea dstq, [dstq+strideq*4] 5159 movq [dstq+strideq*0], xm2 5160 movhps [dstq+strideq*1], xm2 5161 vextracti32x4 xm1, m1, 3 5162 movq [dstq+strideq*2], xm1 5163 movhps [dstq+stride3q ], xm1 5164.w4_end: 5165 RET 5166.w8: 5167 mova m8, [w_mask_shuf8] 5168 vpbroadcastd m9, [pb_64] 5169 jmp .w8_start 5170.w8_loop: 5171 call .main 5172 lea dstq, [dstq+strideq*4] 5173 add maskq, 16 5174.w8_start: 5175 vpermt2b m2, m8, m3 5176 mova m3, m14 5177 vpdpbusd m3, m2, m9 5178 vpermb m3, m15, m3 5179 mova [dstq+strideq*0], xm0 5180 vextracti32x4 [dstq+strideq*1], ym0, 1 5181 vextracti32x4 [dstq+strideq*2], m0, 2 5182 vextracti32x4 [dstq+stride3q ], m0, 3 5183 mova [maskq], xm3 5184 sub hd, 8 5185 jl .w8_end 5186 lea dstq, [dstq+strideq*4] 5187 mova [dstq+strideq*0], xm1 5188 vextracti32x4 [dstq+strideq*1], ym1, 1 5189 vextracti32x4 [dstq+strideq*2], m1, 2 5190 vextracti32x4 [dstq+stride3q ], m1, 3 5191 jg .w8_loop 5192.w8_end: 5193 RET 5194.w16: 5195 mova m8, [w_mask_shuf16] 5196 vpbroadcastd m9, [pb_64] 5197 jmp .w16_start 5198.w16_loop: 5199 call .main 5200 lea dstq, [dstq+strideq*4] 5201 add maskq, 16 5202.w16_start: 5203 vpermt2b m2, m8, m3 5204 mova m3, m14 5205 vpdpbusd m3, m2, m9 5206 vpermb m3, m15, m3 5207 mova [dstq+strideq*0], ym0 5208 vextracti32x8 [dstq+strideq*1], m0, 1 5209 mova [dstq+strideq*2], ym1 5210 vextracti32x8 [dstq+stride3q ], m1, 1 5211 mova [maskq], xm3 5212 sub hd, 4 5213 jg .w16_loop 5214 RET 5215.w32_loop: 5216 call .main 5217 lea dstq, [dstq+strideq*4] 5218 add maskq, 32 5219.w32: 5220 paddw m2, m3 5221 mova m8, m14 5222 vpdpwssd m8, m11, m2 5223 mova [dstq+strideq*0], m0 5224 mova [dstq+strideq*1], m1 5225 call .main 5226 paddw m2, m3 5227 
mova m3, m14 5228 vpdpwssd m3, m11, m2 5229 vpermt2b m8, m15, m3 5230 mova [dstq+strideq*2], m0 5231 mova [dstq+stride3q ], m1 5232 mova [maskq], ym8 5233 sub hd, 4 5234 jg .w32_loop 5235 RET 5236.w64_loop: 5237 call .main 5238 lea dstq, [dstq+strideq*2] 5239 add maskq, 32 5240.w64: 5241 mova m8, m2 5242 mova m9, m3 5243 mova [dstq+strideq*0+64*0], m0 5244 mova [dstq+strideq*0+64*1], m1 5245 call .main 5246 paddw m8, m2 5247 paddw m9, m3 5248 mova m2, m14 5249 vpdpwssd m2, m11, m8 5250 mova m3, m14 5251 vpdpwssd m3, m11, m9 5252 vpermt2b m2, m15, m3 5253 mova [dstq+strideq*1+64*0], m0 5254 mova [dstq+strideq*1+64*1], m1 5255 mova [maskq], ym2 5256 sub hd, 2 5257 jg .w64_loop 5258 RET 5259.w128_loop: 5260 call .main 5261 lea dstq, [dstq+strideq*2] 5262 add maskq, 64 5263.w128: 5264 mova m16, m2 5265 mova m8, m3 5266 mova [dstq+strideq*0+64*0], m0 5267 mova [dstq+strideq*0+64*1], m1 5268 call .main 5269 mova m17, m2 5270 mova m9, m3 5271 mova [dstq+strideq*0+64*2], m0 5272 mova [dstq+strideq*0+64*3], m1 5273 call .main 5274 paddw m2, m16 5275 paddw m3, m8 5276 mova m16, m14 5277 vpdpwssd m16, m11, m2 5278 mova m8, m14 5279 vpdpwssd m8, m11, m3 5280 mova [dstq+strideq*1+64*0], m0 5281 mova [dstq+strideq*1+64*1], m1 5282 call .main 5283 paddw m2, m17 5284 paddw m3, m9 5285 mova m17, m14 5286 vpdpwssd m17, m11, m2 5287 mova m9, m14 5288 vpdpwssd m9, m11, m3 5289 vpermt2b m16, m15, m8 5290 vpermt2b m17, m15, m9 5291 mova [dstq+strideq*1+64*2], m0 5292 mova [dstq+strideq*1+64*3], m1 5293 mova [maskq+32*0], ym16 5294 mova [maskq+32*1], ym17 5295 sub hd, 2 5296 jg .w128_loop 5297 vzeroupper 5298 RET 5299ALIGN function_align 5300.main: 5301 mova m1, [tmp1q+64*0] 5302 mova m3, [tmp2q+64*0] 5303 mova m4, [tmp1q+64*1] 5304 mova m7, [tmp2q+64*1] 5305 add tmp1q, 64*2 5306 add tmp2q, 64*2 5307 psubsw m6, m1, m3 5308 punpcklwd m5, m3, m1 5309 pabsw m6, m6 5310 punpckhwd m3, m1 5311 psubusw m6, m10, m6 5312 psrlw m6, 10 ; 64-m 5313 psubw m2, m11, m6 ; m 5314 punpcklwd m1, m6, m2 
5315 punpckhwd m6, m2 5316 mova m0, m12 5317 vpdpwssd m0, m5, m1 5318 mova m1, m12 5319 vpdpwssd m1, m3, m6 5320 psubsw m5, m4, m7 5321 punpcklwd m6, m7, m4 5322 pabsw m5, m5 5323 punpckhwd m7, m4 5324 psubusw m5, m10, m5 5325 psrlw m5, 10 5326 psubw m3, m11, m5 5327 punpcklwd m4, m5, m3 5328 psrad m0, 4 5329 punpckhwd m5, m3 5330 psrad m1, 4 5331 packusdw m0, m1 5332 mova m1, m12 5333 vpdpwssd m1, m6, m4 5334 mova m4, m12 5335 vpdpwssd m4, m7, m5 5336 psrad m1, 4 5337 psrad m4, 4 5338 packusdw m1, m4 5339 vpsrlvw m0, m13 5340 vpsrlvw m1, m13 5341 ret 5342 5343cglobal w_mask_422_16bpc, 4, 8, 15, dst, stride, tmp1, tmp2, w, h, mask, stride3 5344%define base r7-w_mask_422_avx512icl_table 5345 lea r7, [w_mask_422_avx512icl_table] 5346 tzcnt wd, wm 5347 mov r6d, r8m ; pixel_max 5348 movifnidn hd, hm 5349 shr r6d, 11 5350 movsxd wq, [r7+wq*4] 5351 vpbroadcastd m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32 5352 vpbroadcastd m9, [base+pw_64] 5353 vpbroadcastd m10, [base+mask_round+r6*4] 5354 vpbroadcastd m11, [base+bidir_shift+r6*4] 5355 mov r6d, r7m ; sign 5356 vpbroadcastd m12, [base+w_mask_round+r6*4] 5357 mova ym13, [w_mask_end42x] 5358 mov maskq, maskmp 5359 add wq, r7 5360 paddw m14, m9, m9 ; pw_128 5361 call .main 5362 lea stride3q, [strideq*3] 5363 jmp wq 5364.w4: 5365 movq [dstq+strideq*0], xm0 5366 movhps [dstq+strideq*1], xm0 5367 vextracti32x4 xm2, ym0, 1 5368 movq [dstq+strideq*2], xm2 5369 movhps [dstq+stride3q ], xm2 5370 cmp hd, 8 5371 jl .w4_end 5372 vextracti32x4 xm2, m0, 2 5373 lea dstq, [dstq+strideq*4] 5374 movq [dstq+strideq*0], xm2 5375 movhps [dstq+strideq*1], xm2 5376 vextracti32x4 xm0, m0, 3 5377 movq [dstq+strideq*2], xm0 5378 movhps [dstq+stride3q ], xm0 5379 je .w4_end 5380 lea dstq, [dstq+strideq*4] 5381 movq [dstq+strideq*0], xm1 5382 movhps [dstq+strideq*1], xm1 5383 vextracti32x4 xm2, ym1, 1 5384 movq [dstq+strideq*2], xm2 5385 movhps [dstq+stride3q ], xm2 5386 vextracti32x4 xm2, m1, 2 5387 lea dstq, [dstq+strideq*4] 5388 movq 
[dstq+strideq*0], xm2 5389 movhps [dstq+strideq*1], xm2 5390 vextracti32x4 xm1, m1, 3 5391 movq [dstq+strideq*2], xm1 5392 movhps [dstq+stride3q ], xm1 5393.w4_end: 5394 RET 5395.w8_loop: 5396 call .main 5397 lea dstq, [dstq+strideq*4] 5398.w8: 5399 mova [dstq+strideq*0], xm0 5400 vextracti32x4 [dstq+strideq*1], ym0, 1 5401 vextracti32x4 [dstq+strideq*2], m0, 2 5402 vextracti32x4 [dstq+stride3q ], m0, 3 5403 sub hd, 8 5404 jl .w8_end 5405 lea dstq, [dstq+strideq*4] 5406 mova [dstq+strideq*0], xm1 5407 vextracti32x4 [dstq+strideq*1], ym1, 1 5408 vextracti32x4 [dstq+strideq*2], m1, 2 5409 vextracti32x4 [dstq+stride3q ], m1, 3 5410 jg .w8_loop 5411.w8_end: 5412 RET 5413.w16_loop: 5414 call .main 5415 lea dstq, [dstq+strideq*4] 5416.w16: 5417 mova [dstq+strideq*0], ym0 5418 vextracti32x8 [dstq+strideq*1], m0, 1 5419 mova [dstq+strideq*2], ym1 5420 vextracti32x8 [dstq+stride3q ], m1, 1 5421 sub hd, 4 5422 jg .w16_loop 5423 RET 5424.w32_loop: 5425 call .main 5426 lea dstq, [dstq+strideq*2] 5427.w32: 5428 mova [dstq+strideq*0], m0 5429 mova [dstq+strideq*1], m1 5430 sub hd, 2 5431 jg .w32_loop 5432 RET 5433.w64_loop: 5434 call .main 5435 add dstq, strideq 5436.w64: 5437 mova [dstq+64*0], m0 5438 mova [dstq+64*1], m1 5439 dec hd 5440 jg .w64_loop 5441 RET 5442.w128_loop: 5443 call .main 5444 add dstq, strideq 5445.w128: 5446 mova [dstq+64*0], m0 5447 mova [dstq+64*1], m1 5448 call .main 5449 mova [dstq+64*2], m0 5450 mova [dstq+64*3], m1 5451 dec hd 5452 jg .w128_loop 5453 RET 5454ALIGN function_align 5455.main: 5456 mova m1, [tmp1q+64*0] 5457 mova m3, [tmp2q+64*0] 5458 mova m4, [tmp1q+64*1] 5459 mova m7, [tmp2q+64*1] 5460 add tmp1q, 64*2 5461 add tmp2q, 64*2 5462 psubsw m6, m1, m3 5463 punpcklwd m5, m3, m1 5464 pabsw m6, m6 5465 punpckhwd m3, m1 5466 psubusw m6, m8, m6 5467 psrlw m6, 10 5468 psubw m2, m9, m6 5469 punpcklwd m1, m6, m2 5470 punpckhwd m6, m2 5471 mova m0, m10 5472 vpdpwssd m0, m5, m1 5473 mova m1, m10 5474 vpdpwssd m1, m3, m6 5475 psubsw m5, m4, m7 5476 
punpcklwd m6, m7, m4 5477 pabsw m5, m5 5478 punpckhwd m7, m4 5479 psubusw m5, m8, m5 5480 psrlw m5, 10 5481 psubw m3, m9, m5 5482 punpcklwd m4, m5, m3 5483 psrad m0, 4 5484 punpckhwd m5, m3 5485 psrad m1, 4 5486 packusdw m0, m1 5487 mova m1, m10 5488 vpdpwssd m1, m6, m4 5489 mova m4, m10 5490 vpdpwssd m4, m7, m5 5491 mova m5, m12 5492 vpdpwssd m5, m14, m2 5493 mova m2, m12 5494 vpdpwssd m2, m14, m3 5495 psrad m1, 4 5496 psrad m4, 4 5497 packusdw m1, m4 5498 vpermt2b m5, m13, m2 5499 vpsrlvw m0, m11 5500 vpsrlvw m1, m11 5501 mova [maskq], ym5 5502 add maskq, 32 5503 ret 5504 5505cglobal w_mask_444_16bpc, 4, 8, 13, dst, stride, tmp1, tmp2, w, h, mask, stride3 5506%define base r7-w_mask_444_avx512icl_table 5507 lea r7, [w_mask_444_avx512icl_table] 5508 tzcnt wd, wm 5509 mov r6d, r8m ; pixel_max 5510 movifnidn hd, hm 5511 shr r6d, 11 5512 movsxd wq, [r7+wq*4] 5513 vpbroadcastd m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32 5514 vpbroadcastd m9, [base+pw_64] 5515 vpbroadcastd m10, [base+mask_round+r6*4] 5516 mova m11, [w_mask_end444] 5517 vpbroadcastd m12, [base+bidir_shift+r6*4] 5518 mov maskq, maskmp 5519 add wq, r7 5520 call .main 5521 lea stride3q, [strideq*3] 5522 jmp wq 5523.w4: 5524 movq [dstq+strideq*0], xm0 5525 movhps [dstq+strideq*1], xm0 5526 vextracti32x4 xm2, ym0, 1 5527 movq [dstq+strideq*2], xm2 5528 movhps [dstq+stride3q ], xm2 5529 cmp hd, 8 5530 jl .w4_end 5531 vextracti32x4 xm2, m0, 2 5532 lea dstq, [dstq+strideq*4] 5533 movq [dstq+strideq*0], xm2 5534 movhps [dstq+strideq*1], xm2 5535 vextracti32x4 xm0, m0, 3 5536 movq [dstq+strideq*2], xm0 5537 movhps [dstq+stride3q ], xm0 5538 je .w4_end 5539 lea dstq, [dstq+strideq*4] 5540 movq [dstq+strideq*0], xm1 5541 movhps [dstq+strideq*1], xm1 5542 vextracti32x4 xm2, ym1, 1 5543 movq [dstq+strideq*2], xm2 5544 movhps [dstq+stride3q ], xm2 5545 vextracti32x4 xm2, m1, 2 5546 lea dstq, [dstq+strideq*4] 5547 movq [dstq+strideq*0], xm2 5548 movhps [dstq+strideq*1], xm2 5549 vextracti32x4 xm1, m1, 3 5550 
movq [dstq+strideq*2], xm1 5551 movhps [dstq+stride3q ], xm1 5552.w4_end: 5553 RET 5554.w8_loop: 5555 call .main 5556 lea dstq, [dstq+strideq*4] 5557.w8: 5558 mova [dstq+strideq*0], xm0 5559 vextracti32x4 [dstq+strideq*1], ym0, 1 5560 vextracti32x4 [dstq+strideq*2], m0, 2 5561 vextracti32x4 [dstq+stride3q ], m0, 3 5562 sub hd, 8 5563 jl .w8_end 5564 lea dstq, [dstq+strideq*4] 5565 mova [dstq+strideq*0], xm1 5566 vextracti32x4 [dstq+strideq*1], ym1, 1 5567 vextracti32x4 [dstq+strideq*2], m1, 2 5568 vextracti32x4 [dstq+stride3q ], m1, 3 5569 jg .w8_loop 5570.w8_end: 5571 RET 5572.w16_loop: 5573 call .main 5574 lea dstq, [dstq+strideq*4] 5575.w16: 5576 mova [dstq+strideq*0], ym0 5577 vextracti32x8 [dstq+strideq*1], m0, 1 5578 mova [dstq+strideq*2], ym1 5579 vextracti32x8 [dstq+stride3q ], m1, 1 5580 sub hd, 4 5581 jg .w16_loop 5582 RET 5583.w32_loop: 5584 call .main 5585 lea dstq, [dstq+strideq*2] 5586.w32: 5587 mova [dstq+strideq*0], m0 5588 mova [dstq+strideq*1], m1 5589 sub hd, 2 5590 jg .w32_loop 5591 RET 5592.w64_loop: 5593 call .main 5594 add dstq, strideq 5595.w64: 5596 mova [dstq+64*0], m0 5597 mova [dstq+64*1], m1 5598 dec hd 5599 jg .w64_loop 5600 RET 5601.w128_loop: 5602 call .main 5603 add dstq, strideq 5604.w128: 5605 mova [dstq+64*0], m0 5606 mova [dstq+64*1], m1 5607 call .main 5608 mova [dstq+64*2], m0 5609 mova [dstq+64*3], m1 5610 dec hd 5611 jg .w128_loop 5612 RET 5613ALIGN function_align 5614.main: 5615 mova m1, [tmp1q+64*0] 5616 mova m3, [tmp2q+64*0] 5617 mova m4, [tmp1q+64*1] 5618 mova m7, [tmp2q+64*1] 5619 add tmp1q, 64*2 5620 add tmp2q, 64*2 5621 psubsw m6, m1, m3 5622 punpcklwd m5, m3, m1 5623 pabsw m6, m6 5624 punpckhwd m3, m1 5625 psubusw m6, m8, m6 5626 psrlw m6, 10 5627 psubw m2, m9, m6 5628 punpcklwd m1, m6, m2 5629 punpckhwd m6, m2 5630 mova m0, m10 5631 vpdpwssd m0, m5, m1 5632 mova m1, m10 5633 vpdpwssd m1, m3, m6 5634 psubsw m5, m4, m7 5635 punpcklwd m6, m7, m4 5636 pabsw m5, m5 5637 punpckhwd m7, m4 5638 psubusw m5, m8, m5 5639 psrlw 
m5, 10 5640 psubw m3, m9, m5 5641 punpcklwd m4, m5, m3 5642 psrad m0, 4 5643 punpckhwd m5, m3 5644 psrad m1, 4 5645 packusdw m0, m1 5646 mova m1, m10 5647 vpdpwssd m1, m6, m4 5648 mova m4, m10 5649 vpdpwssd m4, m7, m5 5650 vpermt2b m2, m11, m3 5651 psrad m1, 4 5652 psrad m4, 4 5653 packusdw m1, m4 5654 vpsrlvw m0, m12 5655 vpsrlvw m1, m12 5656 mova [maskq], m2 5657 add maskq, 64 5658 ret 5659 5660cglobal blend_16bpc, 3, 7, 7, dst, ds, tmp, w, h, mask 5661%define base r6-blend_avx512icl_table 5662 lea r6, [blend_avx512icl_table] 5663 tzcnt wd, wm 5664 movifnidn hd, hm 5665 movsxd wq, [r6+wq*4] 5666 movifnidn maskq, maskmp 5667 vpbroadcastd m6, [base+pw_m512] 5668 add wq, r6 5669 lea r6, [dsq*3] 5670 jmp wq 5671.w4: 5672 pmovzxbw ym19, [maskq] 5673 movq xm16, [dstq+dsq*0] 5674 movhps xm16, [dstq+dsq*1] 5675 vpbroadcastq ym17, [dstq+dsq*2] 5676 vpbroadcastq ym18, [dstq+r6 ] 5677 pmullw ym19, ym6 5678 vpblendd ym16, ym17, 0x30 5679 vpblendd ym16, ym18, 0xc0 5680 psubw ym17, ym16, [tmpq] 5681 add maskq, 16 5682 add tmpq, 32 5683 pmulhrsw ym17, ym19 5684 paddw ym16, ym17 5685 vextracti128 xm17, ym16, 1 5686 movq [dstq+dsq*0], xm16 5687 movhps [dstq+dsq*1], xm16 5688 movq [dstq+dsq*2], xm17 5689 movhps [dstq+r6 ], xm17 5690 lea dstq, [dstq+dsq*4] 5691 sub hd, 4 5692 jg .w4 5693 vzeroupper 5694 RET 5695.w8: 5696 pmovzxbw m2, [maskq] 5697 mova xm0, [dstq+dsq*0] 5698 vinserti32x4 ym0, [dstq+dsq*1], 1 5699 vinserti32x4 m0, [dstq+dsq*2], 2 5700 vinserti32x4 m0, [dstq+r6 ], 3 5701 pmullw m2, m6 5702 psubw m1, m0, [tmpq] 5703 add maskq, 32 5704 add tmpq, 64 5705 pmulhrsw m1, m2 5706 paddw m0, m1 5707 mova [dstq+dsq*0], xm0 5708 vextracti32x4 [dstq+dsq*1], ym0, 1 5709 vextracti32x4 [dstq+dsq*2], m0, 2 5710 vextracti32x4 [dstq+r6 ], m0, 3 5711 lea dstq, [dstq+dsq*4] 5712 sub hd, 4 5713 jg .w8 5714 RET 5715.w16: 5716 pmovzxbw m4, [maskq+32*0] 5717 pmovzxbw m5, [maskq+32*1] 5718 mova ym0, [dstq+dsq*0] 5719 vinserti32x8 m0, [dstq+dsq*1], 1 5720 mova ym1, [dstq+dsq*2] 5721 
vinserti32x8 m1, [dstq+r6 ], 1 5722 pmullw m4, m6 5723 pmullw m5, m6 5724 psubw m2, m0, [tmpq+64*0] 5725 psubw m3, m1, [tmpq+64*1] 5726 add maskq, 32*2 5727 add tmpq, 64*2 5728 pmulhrsw m2, m4 5729 pmulhrsw m3, m5 5730 paddw m0, m2 5731 paddw m1, m3 5732 mova [dstq+dsq*0], ym0 5733 vextracti32x8 [dstq+dsq*1], m0, 1 5734 mova [dstq+dsq*2], ym1 5735 vextracti32x8 [dstq+r6 ], m1, 1 5736 lea dstq, [dstq+dsq*4] 5737 sub hd, 4 5738 jg .w16 5739 RET 5740.w32: 5741 pmovzxbw m4, [maskq+32*0] 5742 pmovzxbw m5, [maskq+32*1] 5743 mova m0, [dstq+dsq*0] 5744 mova m1, [dstq+dsq*1] 5745 pmullw m4, m6 5746 pmullw m5, m6 5747 psubw m2, m0, [tmpq+ 64*0] 5748 psubw m3, m1, [tmpq+ 64*1] 5749 add maskq, 32*2 5750 add tmpq, 64*2 5751 pmulhrsw m2, m4 5752 pmulhrsw m3, m5 5753 paddw m0, m2 5754 paddw m1, m3 5755 mova [dstq+dsq*0], m0 5756 mova [dstq+dsq*1], m1 5757 lea dstq, [dstq+dsq*2] 5758 sub hd, 2 5759 jg .w32 5760 RET 5761 5762cglobal blend_v_16bpc, 3, 6, 5, dst, ds, tmp, w, h 5763 lea r5, [blend_v_avx512icl_table] 5764 tzcnt wd, wm 5765 movifnidn hd, hm 5766 movsxd wq, [r5+wq*4] 5767 add wq, r5 5768 jmp wq 5769.w2: 5770 vpbroadcastd xmm2, [obmc_masks_avx2+2*2] 5771.w2_loop: 5772 movd xmm0, [dstq+dsq*0] 5773 pinsrd xmm0, [dstq+dsq*1], 1 5774 movq xmm1, [tmpq] 5775 add tmpq, 4*2 5776 psubw xmm1, xmm0, xmm1 5777 pmulhrsw xmm1, xmm2 5778 paddw xmm0, xmm1 5779 movd [dstq+dsq*0], xmm0 5780 pextrd [dstq+dsq*1], xmm0, 1 5781 lea dstq, [dstq+dsq*2] 5782 sub hd, 2 5783 jg .w2_loop 5784 RET 5785.w4: 5786 vpbroadcastq xmm2, [obmc_masks_avx2+4*2] 5787.w4_loop: 5788 movq xmm0, [dstq+dsq*0] 5789 movhps xmm0, [dstq+dsq*1] 5790 psubw xmm1, xmm0, [tmpq] 5791 add tmpq, 8*2 5792 pmulhrsw xmm1, xmm2 5793 paddw xmm0, xmm1 5794 movq [dstq+dsq*0], xmm0 5795 movhps [dstq+dsq*1], xmm0 5796 lea dstq, [dstq+dsq*2] 5797 sub hd, 2 5798 jg .w4_loop 5799 RET 5800.w8: 5801 vbroadcasti32x4 ym2, [obmc_masks_avx2+8*2] 5802.w8_loop: 5803 mova xm0, [dstq+dsq*0] 5804 vinserti32x4 ym0, [dstq+dsq*1], 1 5805 psubw ym1, ym0, 
[tmpq] 5806 add tmpq, 16*2 5807 pmulhrsw ym1, ym2 5808 paddw ym0, ym1 5809 mova [dstq+dsq*0], xm0 5810 vextracti32x4 [dstq+dsq*1], ym0, 1 5811 lea dstq, [dstq+dsq*2] 5812 sub hd, 2 5813 jg .w8_loop 5814 RET 5815.w16: 5816 vbroadcasti32x8 m2, [obmc_masks_avx2+16*2] 5817.w16_loop: 5818 mova ym0, [dstq+dsq*0] 5819 vinserti32x8 m0, [dstq+dsq*1], 1 5820 psubw m1, m0, [tmpq] 5821 add tmpq, 32*2 5822 pmulhrsw m1, m2 5823 paddw m0, m1 5824 mova [dstq+dsq*0], ym0 5825 vextracti32x8 [dstq+dsq*1], m0, 1 5826 lea dstq, [dstq+dsq*2] 5827 sub hd, 2 5828 jg .w16_loop 5829 RET 5830.w32: 5831 mova m4, [obmc_masks_avx2+32*2] 5832.w32_loop: 5833 mova m0, [dstq+dsq*0] 5834 psubw m2, m0, [tmpq+ 64*0] 5835 mova m1, [dstq+dsq*1] 5836 psubw m3, m1, [tmpq+ 64*1] 5837 add tmpq, 64*2 5838 pmulhrsw m2, m4 5839 pmulhrsw m3, m4 5840 paddw m0, m2 5841 paddw m1, m3 5842 mova [dstq+dsq*0], m0 5843 mova [dstq+dsq*1], m1 5844 lea dstq, [dstq+dsq*2] 5845 sub hd, 2 5846 jg .w32_loop 5847 RET 5848 5849cglobal blend_h_16bpc, 3, 7, 9, dst, ds, tmp, w, h, mask 5850%define base r6-$$ 5851 lea r6, [$$] 5852 tzcnt wd, wm 5853 mov hd, hm 5854 movsxd wq, [base+blend_h_avx512icl_table+wq*4] 5855 lea maskq, [base+obmc_masks_avx2+hq*2] 5856 lea hd, [hq*3] 5857 lea wq, [base+blend_h_avx512icl_table+wq] 5858 shr hd, 2 ; h * 3/4 5859 lea maskq, [maskq+hq*2] 5860 neg hq 5861 jmp wq 5862.w2: 5863 movd xmm0, [dstq+dsq*0] 5864 pinsrd xmm0, [dstq+dsq*1], 1 5865 movd xmm2, [maskq+hq*2] 5866 movq xmm1, [tmpq] 5867 add tmpq, 4*2 5868 punpcklwd xmm2, xmm2 5869 psubw xmm1, xmm0, xmm1 5870 pmulhrsw xmm1, xmm2 5871 paddw xmm0, xmm1 5872 movd [dstq+dsq*0], xmm0 5873 pextrd [dstq+dsq*1], xmm0, 1 5874 lea dstq, [dstq+dsq*2] 5875 add hq, 2 5876 jl .w2 5877 RET 5878.w4: 5879 mova xmm3, [blend_shuf] 5880.w4_loop: 5881 movq xmm0, [dstq+dsq*0] 5882 movhps xmm0, [dstq+dsq*1] 5883 movd xmm2, [maskq+hq*2] 5884 psubw xmm1, xmm0, [tmpq] 5885 add tmpq, 8*2 5886 pshufb xmm2, xmm3 5887 pmulhrsw xmm1, xmm2 5888 paddw xmm0, xmm1 5889 movq 
[dstq+dsq*0], xmm0 5890 movhps [dstq+dsq*1], xmm0 5891 lea dstq, [dstq+dsq*2] 5892 add hq, 2 5893 jl .w4_loop 5894 RET 5895.w8: 5896 vbroadcasti32x4 ym3, [blend_shuf] 5897 shufpd ym3, ym3, 0x0c 5898.w8_loop: 5899 mova xm0, [dstq+dsq*0] 5900 vinserti32x4 ym0, [dstq+dsq*1], 1 5901 vpbroadcastd ym2, [maskq+hq*2] 5902 psubw ym1, ym0, [tmpq] 5903 add tmpq, 16*2 5904 pshufb ym2, ym3 5905 pmulhrsw ym1, ym2 5906 paddw ym0, ym1 5907 mova [dstq+dsq*0], xm0 5908 vextracti32x4 [dstq+dsq*1], ym0, 1 5909 lea dstq, [dstq+dsq*2] 5910 add hq, 2 5911 jl .w8_loop 5912 RET 5913.w16: 5914 vbroadcasti32x4 m3, [blend_shuf] 5915 shufpd m3, m3, 0xf0 5916.w16_loop: 5917 mova ym0, [dstq+dsq*0] 5918 vinserti32x8 m0, [dstq+dsq*1], 1 5919 vpbroadcastd m2, [maskq+hq*2] 5920 psubw m1, m0, [tmpq] 5921 add tmpq, 32*2 5922 pshufb m2, m3 5923 pmulhrsw m1, m2 5924 paddw m0, m1 5925 mova [dstq+dsq*0], ym0 5926 vextracti32x8 [dstq+dsq*1], m0, 1 5927 lea dstq, [dstq+dsq*2] 5928 add hq, 2 5929 jl .w16_loop 5930 RET 5931.w32: 5932 vpbroadcastw m4, [maskq+hq*2] 5933 vpbroadcastw m5, [maskq+hq*2+2] 5934 mova m0, [dstq+dsq*0] 5935 psubw m2, m0, [tmpq+ 64*0] 5936 mova m1, [dstq+dsq*1] 5937 psubw m3, m1, [tmpq+ 64*1] 5938 add tmpq, 64*2 5939 pmulhrsw m2, m4 5940 pmulhrsw m3, m5 5941 paddw m0, m2 5942 paddw m1, m3 5943 mova [dstq+dsq*0], m0 5944 mova [dstq+dsq*1], m1 5945 lea dstq, [dstq+dsq*2] 5946 add hq, 2 5947 jl .w32 5948 RET 5949.w64: 5950 vpbroadcastw m4, [maskq+hq*2] 5951 mova m0, [dstq+64*0] 5952 psubw m2, m0, [tmpq+64*0] 5953 mova m1, [dstq+64*1] 5954 psubw m3, m1, [tmpq+64*1] 5955 add tmpq, 64*2 5956 pmulhrsw m2, m4 5957 pmulhrsw m3, m4 5958 paddw m0, m2 5959 paddw m1, m3 5960 mova [dstq+64*0], m0 5961 mova [dstq+64*1], m1 5962 add dstq, dsq 5963 inc hq 5964 jl .w64 5965 RET 5966.w128: 5967 vpbroadcastw m8, [maskq+hq*2] 5968 mova m0, [dstq+64*0] 5969 psubw m4, m0, [tmpq+64*0] 5970 mova m1, [dstq+64*1] 5971 psubw m5, m1, [tmpq+64*1] 5972 mova m2, [dstq+64*2] 5973 psubw m6, m2, [tmpq+64*2] 5974 mova m3, 
[dstq+64*3] 5975 psubw m7, m3, [tmpq+64*3] 5976 add tmpq, 64*4 5977 REPX {pmulhrsw x, m8}, m4, m5, m6, m7 5978 paddw m0, m4 5979 paddw m1, m5 5980 paddw m2, m6 5981 paddw m3, m7 5982 mova [dstq+64*0], m0 5983 mova [dstq+64*1], m1 5984 mova [dstq+64*2], m2 5985 mova [dstq+64*3], m3 5986 add dstq, dsq 5987 inc hq 5988 jl .w128 5989 RET 5990 5991cglobal resize_16bpc, 6, 12, 32, dst, dst_stride, src, src_stride, \ 5992 dst_w, h, src_w, dx, mx0, pxmax 5993 sub dword mx0m, 4<<14 5994 sub dword src_wm, 8 5995 mov r6, ~0 5996 vpbroadcastd m5, dxm 5997 vpbroadcastd m8, mx0m 5998 vpbroadcastd m6, src_wm 5999 kmovq k6, r6 6000 DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, _, _, pxmax 6001 LEA r7, $$ 6002%define base r7-$$ 6003 vpbroadcastd m3, [base+pd_16384] 6004 vpbroadcastd m7, [base+pd_63] 6005 mova m24, [base+resize_permA] 6006 mova m25, [base+resize_permB] 6007 mova m26, [base+resize_permC] 6008 mova m27, [base+resize_permD] 6009 vbroadcasti32x4 m28, [base+resize_shufA] 6010 vbroadcasti32x4 m29, [base+resize_shufB] 6011 mova m30, [base+resize_permE] 6012 vpbroadcastw ym31, pxmaxm 6013 vpdpwssd m8, m5, [base+rescale_mul] ; mx+dx*[0-15] 6014 pslld m5, 4 ; dx*16 6015 pslld m6, 14 6016 pxor m2, m2 6017.loop_y: 6018 xor xd, xd 6019 mova m4, m8 ; per-line working version of mx 6020.loop_x: 6021 pmaxsd m0, m4, m2 6022 psrad m9, m4, 8 ; filter offset (unmasked) 6023 pminsd m0, m6 ; iclip(mx, 0, src_w-8) 6024 psubd m1, m4, m0 ; pshufb offset 6025 psrad m0, 14 ; clipped src_x offset 6026 psrad m1, 14 ; pshufb edge_emu offset 6027 vptestmd k5, m1, m1 6028 pand m9, m7 ; filter offset (masked) 6029 ktestw k5, k5 6030 jz .load 6031 vpbroadcastq m14, [base+pd_0_4] 6032 vpermq m10, m0, q1100 6033 vpermq m11, m0, q3322 6034 vpermq m20, m1, q1100 6035 vpermq m21, m1, q3322 6036 punpckldq m10, m10 6037 punpckldq m11, m11 6038 punpckldq m20, m20 6039 punpckldq m21, m21 6040 paddd m10, m14 6041 paddd m11, m14 6042 paddd m20, m14 6043 paddd m21, m14 6044 vextracti32x8 ym12, m10, 
1 6045 vextracti32x8 ym13, m11, 1 6046 vextracti32x8 ym22, m20, 1 6047 vextracti32x8 ym23, m21, 1 6048 kmovq k1, k6 6049 kmovq k2, k6 6050 kmovq k3, k6 6051 kmovq k4, k6 6052 vpgatherdq m16{k1}, [srcq+ym10*2] ; 0 1 2 3 6053 vpgatherdq m17{k2}, [srcq+ym11*2] ; 4 5 6 7 6054 vpgatherdq m18{k3}, [srcq+ym12*2] ; 8 9 A B 6055 vpgatherdq m19{k4}, [srcq+ym13*2] ; C D E F 6056 kmovq k1, k6 6057 kmovq k2, k6 6058 kmovq k3, k6 6059 kmovq k4, k6 6060 vpgatherdq m0{k1}, [base+resize_shuf+8+ym20*2] 6061 vpgatherdq m1{k2}, [base+resize_shuf+8+ym21*2] 6062 vpgatherdq m14{k3}, [base+resize_shuf+8+ym22*2] 6063 vpgatherdq m15{k4}, [base+resize_shuf+8+ym23*2] 6064 pshufb m16, m0 6065 pshufb m17, m1 6066 pshufb m18, m14 6067 pshufb m19, m15 6068 mova m20, m24 6069 mova m22, m24 6070 mova m21, m25 6071 mova m23, m25 6072 vpermi2d m20, m16, m17 ; 0-3a 0-3b 4-7a 4-7b 6073 vpermi2d m21, m16, m17 ; 0-3c 0-3d 4-7c 4-7d 6074 vpermi2d m22, m18, m19 ; 8-Ba 8-Bb C-Fa C-Fb 6075 vpermi2d m23, m18, m19 ; 8-Bc 8-Bd C-Fc C-Fd 6076 mova m15, m26 6077 mova m17, m26 6078 mova m16, m27 6079 mova m18, m27 6080 vpermi2q m15, m20, m22 ; 0-3a 4-7a 8-Ba C-Fa 6081 vpermi2q m16, m20, m22 ; 0-3b 4-7b 8-Bb C-Fb 6082 vpermi2q m17, m21, m23 ; 0-3c 4-7c 8-Bc C-Fc 6083 vpermi2q m18, m21, m23 ; 0-3d 4-7d 8-Bd C-Fd 6084 kmovq k1, k6 6085 kmovq k2, k6 6086 vpgatherdd m11{k1}, [base+resize_filter+m9*8+0] 6087 vpgatherdd m13{k2}, [base+resize_filter+m9*8+4] 6088 pshufb m10, m11, m28 6089 pshufb m11, m11, m29 6090 pshufb m12, m13, m28 6091 pshufb m13, m13, m29 6092 jmp .filter 6093.load: 6094 kmovq k1, k6 6095 kmovq k2, k6 6096 kmovq k3, k6 6097 kmovq k4, k6 6098 vpgatherdd m11{k1}, [base+resize_filter+m9*8+0] 6099 vpgatherdd m13{k2}, [base+resize_filter+m9*8+4] 6100 pshufb m10, m11, m28 6101 pshufb m11, m11, m29 6102 pshufb m12, m13, m28 6103 pshufb m13, m13, m29 6104 vpgatherdd m15{k3}, [srcq+m0*2+ 0] 6105 vpgatherdd m16{k4}, [srcq+m0*2+ 4] 6106 kmovq k1, k6 6107 kmovq k2, k6 6108 vpgatherdd m17{k1}, [srcq+m0*2+ 8] 6109 
vpgatherdd m18{k2}, [srcq+m0*2+12] 6110.filter: 6111 mova m14, m2 6112 vpdpwssd m14, m15, m10 6113 vpdpwssd m14, m16, m11 6114 vpdpwssd m14, m17, m12 6115 vpdpwssd m14, m18, m13 6116 psubd m14, m3, m14 6117 psrad m14, 15 6118 packusdw m14, m14 6119 vpermq m14, m30, m14 6120 pminsw ym14, ym31 6121 mova [dstq+xq*2], ym14 6122 paddd m4, m5 6123 add xd, 16 6124 cmp xd, dst_wd 6125 jl .loop_x 6126 add dstq, dst_strideq 6127 add srcq, src_strideq 6128 dec hd 6129 jg .loop_y 6130 RET 6131 6132%endif ; ARCH_X86_64 6133