; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; Copyright © 2018, VideoLabs
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA 16

; dav1d_obmc_masks[] with 64-x interleaved
obmc_masks: db 0, 0, 0, 0
        ; 2 @4
        db 45, 19, 64, 0
        ; 4 @8
        db 39, 25, 50, 14, 59, 5, 64, 0
        ; 8 @16
        db 36, 28, 42, 22, 48, 16, 53, 11, 57, 7, 61, 3, 64, 0, 64, 0
        ; 16 @32
        db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10
        db 56, 8, 58, 6, 60, 4, 61, 3, 64, 0, 64, 0, 64, 0, 64, 0
        ; 32 @64
        db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20
        db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55, 9
        db 56, 8, 57, 7, 58, 6, 59, 5, 60, 4, 60, 4, 61, 3, 62, 2

warp_8x8_shufA: db 0, 2, 4, 6, 1, 3, 5, 7, 1, 3, 5, 7, 2, 4, 6, 8
warp_8x8_shufB: db 4, 6, 8, 10, 5, 7, 9, 11, 5, 7, 9, 11, 6, 8, 10, 12
warp_8x8_shufC: db 2, 4, 6, 8, 3, 5, 7, 9, 3, 5, 7, 9, 4, 6, 8, 10
warp_8x8_shufD: db 6, 8, 10, 12, 7, 9, 11, 13, 7, 9, 11, 13, 8, 10, 12, 14
blend_shuf:     db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12
                db 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14
subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
subpel_s_shuf2: db 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11
subpel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
bilin_h_shuf4:  db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12
bilin_h_shuf8:  db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
unpckw:         db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
rescale_mul:    dd 0, 1, 2, 3
resize_shuf:    db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7

wm_420_sign: times 4 dw 258
             times 4 dw 257
wm_422_sign: times 8 db 128
             times 8 db 127

pb_8x0_8x8: times 8 db 0
            times 8 db 8
bdct_lb_dw: times 4 db 0
            times 4 db 4
            times 4 db 8
            times 4 db 12

pb_64:    times 16 db 64
pw_m256:  times 8 dw -256
pw_1:     times 8 dw 1
pw_2:     times 8 dw 2
pw_8:     times 8 dw 8
pw_15:    times 8 dw 15
pw_26:    times 8 dw 26
pw_34:    times 8 dw 34
pw_512:   times 8 dw 512
pw_1024:  times 8 dw 1024
pw_2048:  times 8 dw 2048
pw_6903:  times 8 dw 6903
pw_8192:  times 8 dw 8192
pd_32:    times 4 dd 32
pd_63:    times 4 dd 63
pd_512:   times 4 dd 512
pd_16384: times 4 dd 16384
pd_32768: times 4 dd 32768
pd_262144: times 4 dd 262144
pd_0x3ff:  times 4 dd 0x3ff
pd_0x4000: times 4 dd 0x4000
pq_0x40000000: times 2 dq 0x40000000

const mc_warp_filter2 ; dav1d_mc_warp_filter[] reordered for pmaddubsw usage
    ; [-1, 0)
    db 0, 127, 0, 0, 0, 1, 0, 0, 0, 127, 0, 0, -1, 2, 0, 0
    db 1, 127, -1, 0, -3, 4, 0, 0, 1, 126, -2, 0, -4, 6, 1, 0
    db 1, 126, -3, 0, -5, 8, 1, 0, 1, 125, -4, 0, -6, 11, 1, 0
    db 1, 124, -4, 0, -7, 13, 1, 0, 2, 123, -5, 0, -8, 15, 1, 0
    db 2, 122, -6, 0, -9, 18, 1, 0, 2, 121, -6, 0, -10, 20, 1, 0
    db 2, 120, -7, 0, -11, 22, 2, 0, 2, 119, -8, 0, -12, 25, 2, 0
    db 3, 117, -8, 0, -13, 27, 2, 0, 3, 116, -9, 0, -13, 29, 2, 0
    db 3, 114, -10, 0, -14, 32, 3, 0, 3, 113, -10, 0, -15, 35, 2, 0
    db 3, 111, -11, 0, -15, 37, 3, 0, 3, 109, -11, 0, -16, 40, 3, 0
    db 3, 108, -12, 0, -16, 42, 3, 0, 4, 106, -13, 0, -17, 45, 3, 0
    db 4, 104, -13, 0, -17, 47, 3, 0, 4, 102, -14, 0, -17, 50, 3, 0
    db 4, 100, -14, 0, -17, 52, 3, 0, 4, 98, -15, 0, -18, 55, 4, 0
    db 4, 96, -15, 0, -18, 58, 3, 0, 4, 94, -16, 0, -18, 60, 4, 0
    db 4, 91, -16, 0, -18, 63, 4, 0, 4, 89, -16, 0, -18, 65, 4, 0
    db 4, 87, -17, 0, -18, 68, 4, 0, 4, 85, -17, 0, -18, 70, 4, 0
    db 4, 82, -17, 0, -18, 73, 4, 0, 4, 80, -17, 0, -18, 75, 4, 0
    db 4, 78, -18, 0, -18, 78, 4, 0, 4, 75, -18, 0, -17, 80, 4, 0
    db 4, 73, -18, 0, -17, 82, 4, 0, 4, 70, -18, 0, -17, 85, 4, 0
    db 4, 68, -18, 0, -17, 87, 4, 0, 4, 65, -18, 0, -16, 89, 4, 0
    db 4, 63, -18, 0, -16, 91, 4, 0, 4, 60, -18, 0, -16, 94, 4, 0
    db 3, 58, -18, 0, -15, 96, 4, 0, 4, 55, -18, 0, -15, 98, 4, 0
    db 3, 52, -17, 0, -14, 100, 4, 0, 3, 50, -17, 0, -14, 102, 4, 0
    db 3, 47, -17, 0, -13, 104, 4, 0, 3, 45, -17, 0, -13, 106, 4, 0
    db 3, 42, -16, 0, -12, 108, 3, 0, 3, 40, -16, 0, -11, 109, 3, 0
    db 3, 37, -15, 0, -11, 111, 3, 0, 2, 35, -15, 0, -10, 113, 3, 0
    db 3, 32, -14, 0, -10, 114, 3, 0, 2, 29, -13, 0, -9, 116, 3, 0
    db 2, 27, -13, 0, -8, 117, 3, 0, 2, 25, -12, 0, -8, 119, 2, 0
    db 2, 22, -11, 0, -7, 120, 2, 0, 1, 20, -10, 0, -6, 121, 2, 0
    db 1, 18, -9, 0, -6, 122, 2, 0, 1, 15, -8, 0, -5, 123, 2, 0
    db 1, 13, -7, 0, -4, 124, 1, 0, 1, 11, -6, 0, -4, 125, 1, 0
    db 1, 8, -5, 0, -3, 126, 1, 0, 1, 6, -4, 0, -2, 126, 1, 0
    db 0, 4, -3, 0, -1, 127, 1, 0, 0, 2, -1, 0, 0, 127, 0, 0
    ; [0, 1)
    db 0, 0, 1, 0, 0, 127, 0, 0, 0, -1, 2, 0, 0, 127, 0, 0
    db 0, -3, 4, 1, 1, 127, -2, 0, 0, -5, 6, 1, 1, 127, -2, 0
    db 0, -6, 8, 1, 2, 126, -3, 0, -1, -7, 11, 2, 2, 126, -4, -1
    db -1, -8, 13, 2, 3, 125, -5, -1, -1, -10, 16, 3, 3, 124, -6, -1
    db -1, -11, 18, 3, 4, 123, -7, -1, -1, -12, 20, 3, 4, 122, -7, -1
    db -1, -13, 23, 3, 4, 121, -8, -1, -2, -14, 25, 4, 5, 120, -9, -1
    db -1, -15, 27, 4, 5, 119, -10, -1, -1, -16, 30, 4, 5, 118, -11, -1
    db -2, -17, 33, 5, 6, 116, -12, -1, -2, -17, 35, 5, 6, 114, -12, -1
    db -2, -18, 38, 5, 6, 113, -13, -1, -2, -19, 41, 6, 7, 111, -14, -2
    db -2, -19, 43, 6, 7, 110, -15, -2, -2, -20, 46, 6, 7, 108, -15, -2
    db -2, -20, 49, 6, 7, 106, -16, -2, -2, -21, 51, 7, 7, 104, -16, -2
    db -2, -21, 54, 7, 7, 102, -17, -2, -2, -21, 56, 7, 8, 100, -18, -2
    db -2, -22, 59, 7, 8, 98, -18, -2, -2, -22, 62, 7, 8, 96, -19, -2
    db -2, -22, 64, 7, 8, 94, -19, -2, -2, -22, 67, 8, 8, 91, -20, -2
    db -2, -22, 69, 8, 8, 89, -20, -2, -2, -22, 72, 8, 8, 87, -21, -2
    db -2, -21, 74, 8, 8, 84, -21, -2, -2, -22, 77, 8, 8, 82, -21, -2
    db -2, -21, 79, 8, 8, 79, -21, -2, -2, -21, 82, 8, 8, 77, -22, -2
    db -2, -21, 84, 8, 8, 74, -21, -2, -2, -21, 87, 8, 8, 72, -22, -2
    db -2, -20, 89, 8, 8, 69, -22, -2, -2, -20, 91, 8, 8, 67, -22, -2
    db -2, -19, 94, 8, 7, 64, -22, -2, -2, -19, 96, 8, 7, 62, -22, -2
    db -2, -18, 98, 8, 7, 59, -22, -2, -2, -18, 100, 8, 7, 56, -21, -2
    db -2, -17, 102, 7, 7, 54, -21, -2, -2, -16, 104, 7, 7, 51, -21, -2
    db -2, -16, 106, 7, 6, 49, -20, -2, -2, -15, 108, 7, 6, 46, -20, -2
    db -2, -15, 110, 7, 6, 43, -19, -2, -2, -14, 111, 7, 6, 41, -19, -2
    db -1, -13, 113, 6, 5, 38, -18, -2, -1, -12, 114, 6, 5, 35, -17, -2
    db -1, -12, 116, 6, 5, 33, -17, -2, -1, -11, 118, 5, 4, 30, -16, -1
    db -1, -10, 119, 5, 4, 27, -15, -1, -1, -9, 120, 5, 4, 25, -14, -2
    db -1, -8, 121, 4, 3, 23, -13, -1, -1, -7, 122, 4, 3, 20, -12, -1
    db -1, -7, 123, 4, 3, 18, -11, -1, -1, -6, 124, 3, 3, 16, -10, -1
    db -1, -5, 125, 3, 2, 13, -8, -1, -1, -4, 126, 2, 2, 11, -7, -1
    db 0, -3, 126, 2, 1, 8, -6, 0, 0, -2, 127, 1, 1, 6, -5, 0
    db 0, -2, 127, 1, 1, 4, -3, 0, 0, 0, 127, 0, 0, 2, -1, 0
    ; [1, 2)
    db 0, 0, 127, 0, 0, 1, 0, 0, 0, 0, 127, 0, 0, -1, 2, 0
    db 0, 1, 127, -1, 0, -3, 4, 0, 0, 1, 126, -2, 0, -4, 6, 1
    db 0, 1, 126, -3, 0, -5, 8, 1, 0, 1, 125, -4, 0, -6, 11, 1
    db 0, 1, 124, -4, 0, -7, 13, 1, 0, 2, 123, -5, 0, -8, 15, 1
    db 0, 2, 122, -6, 0, -9, 18, 1, 0, 2, 121, -6, 0, -10, 20, 1
    db 0, 2, 120, -7, 0, -11, 22, 2, 0, 2, 119, -8, 0, -12, 25, 2
    db 0, 3, 117, -8, 0, -13, 27, 2, 0, 3, 116, -9, 0, -13, 29, 2
    db 0, 3, 114, -10, 0, -14, 32, 3, 0, 3, 113, -10, 0, -15, 35, 2
    db 0, 3, 111, -11, 0, -15, 37, 3, 0, 3, 109, -11, 0, -16, 40, 3
    db 0, 3, 108, -12, 0, -16, 42, 3, 0, 4, 106, -13, 0, -17, 45, 3
    db 0, 4, 104, -13, 0, -17, 47, 3, 0, 4, 102, -14, 0, -17, 50, 3
    db 0, 4, 100, -14, 0, -17, 52, 3, 0, 4, 98, -15, 0, -18, 55, 4
    db 0, 4, 96, -15, 0, -18, 58, 3, 0, 4, 94, -16, 0, -18, 60, 4
    db 0, 4, 91, -16, 0, -18, 63, 4, 0, 4, 89, -16, 0, -18, 65, 4
    db 0, 4, 87, -17, 0, -18, 68, 4, 0, 4, 85, -17, 0, -18, 70, 4
    db 0, 4, 82, -17, 0, -18, 73, 4, 0, 4, 80, -17, 0, -18, 75, 4
    db 0, 4, 78, -18, 0, -18, 78, 4, 0, 4, 75, -18, 0, -17, 80, 4
    db 0, 4, 73, -18, 0, -17, 82, 4, 0, 4, 70, -18, 0, -17, 85, 4
    db 0, 4, 68, -18, 0, -17, 87, 4, 0, 4, 65, -18, 0, -16, 89, 4
    db 0, 4, 63, -18, 0, -16, 91, 4, 0, 4, 60, -18, 0, -16, 94, 4
    db 0, 3, 58, -18, 0, -15, 96, 4, 0, 4, 55, -18, 0, -15, 98, 4
    db 0, 3, 52, -17, 0, -14, 100, 4, 0, 3, 50, -17, 0, -14, 102, 4
    db 0, 3, 47, -17, 0, -13, 104, 4, 0, 3, 45, -17, 0, -13, 106, 4
    db 0, 3, 42, -16, 0, -12, 108, 3, 0, 3, 40, -16, 0, -11, 109, 3
    db 0, 3, 37, -15, 0, -11, 111, 3, 0, 2, 35, -15, 0, -10, 113, 3
    db 0, 3, 32, -14, 0, -10, 114, 3, 0, 2, 29, -13, 0, -9, 116, 3
    db 0, 2, 27, -13, 0, -8, 117, 3, 0, 2, 25, -12, 0, -8, 119, 2
    db 0, 2, 22, -11, 0, -7, 120, 2, 0, 1, 20, -10, 0, -6, 121, 2
    db 0, 1, 18, -9, 0, -6, 122, 2, 0, 1, 15, -8, 0, -5, 123, 2
    db 0, 1, 13, -7, 0, -4, 124, 1, 0, 1, 11, -6, 0, -4, 125, 1
    db 0, 1, 8, -5, 0, -3, 126, 1, 0, 1, 6, -4, 0, -2, 126, 1
    db 0, 0, 4, -3, 0, -1, 127, 1, 0, 0, 2, -1, 0, 0, 127, 0
    db 0, 0, 2, -1, 0, 0, 127, 0

pw_258: times 2 dw 258

cextern mc_subpel_filters
%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)

%macro BIDIR_JMP_TABLE 2-*
    ; %3 is the first width here, so the -2*%3 bias lets callers index
    ; the dword entries directly with tzcnt(w)*4
    %xdefine %1_%2_table (%%table - 2*%3)
    %xdefine %%base %1_%2_table
    %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
    ; macro-local label; the table is emitted at expansion time
    %%table:
    %rep %0 - 2 ; one entry per width argument
        dd %%prefix %+ .w%3 - %%base
        %rotate 1
    %endrep
%endmacro

BIDIR_JMP_TABLE avg,        ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg,      ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask,       ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_420, ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_422, ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_444, ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE blend,      ssse3, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_v,    ssse3, 2, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_h,    ssse3, 2, 4, 8, 16, 16, 16, 16

%macro BASE_JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - %3)
    %xdefine %%base %1_%2
    %%table:
    %rep %0 - 2
        dw %%base %+ _w%3 - %%base
        %rotate 1
    %endrep
%endmacro

%xdefine prep_sse2 mangle(private_prefix %+ _prep_bilin_8bpc_sse2.prep)
%xdefine put_ssse3 mangle(private_prefix %+ _put_bilin_8bpc_ssse3.put)
%xdefine prep_ssse3 mangle(private_prefix %+ _prep_bilin_8bpc_ssse3.prep)

BASE_JMP_TABLE put,  ssse3, 2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE prep, ssse3,    4, 8, 16, 32, 64, 128

%macro HV_JMP_TABLE 5-*
    %xdefine %%prefix mangle(private_prefix %+ _%1_%2_8bpc_%3)
    %xdefine %%base %1_%3
    %assign %%types %4
    %if %%types & 1
        %xdefine %1_%2_h_%3_table (%%h - %5)
        %%h:
        %rep %0 - 4
            dw %%prefix %+ .h_w%5 - %%base
            %rotate 1
        %endrep
        %rotate 4
    %endif
    %if %%types & 2
        %xdefine %1_%2_v_%3_table (%%v - %5)
        %%v:
        %rep %0 - 4
            dw %%prefix %+ .v_w%5 - %%base
            %rotate 1
        %endrep
        %rotate 4
    %endif
    %if %%types & 4
        %xdefine %1_%2_hv_%3_table (%%hv - %5)
        %%hv:
        %rep %0 - 4
            dw %%prefix %+ .hv_w%5 - %%base
            %rotate 1
        %endrep
    %endif
%endmacro

HV_JMP_TABLE prep, 8tap,  sse2,  1,    4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, bilin, sse2,  7,    4, 8, 16, 32, 64, 128
HV_JMP_TABLE put,  8tap,  ssse3, 3, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, 8tap,  ssse3, 1,    4, 8, 16, 32, 64, 128
HV_JMP_TABLE put,  bilin, ssse3, 7, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, bilin, ssse3, 7,    4, 8, 16, 32, 64, 128

%macro SCALED_JMP_TABLE 2-*
    %xdefine %1_%2_table (%%table - %3)
    %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2)
%%table:
    %rep %0 - 2
        dw %%base %+ .w%3 - %%base
        %rotate 1
    %endrep
    %rotate 2
%%dy_1024:
    %xdefine %1_%2_dy1_table (%%dy_1024 - %3)
    %rep %0 - 2
        dw %%base %+ .dy1_w%3 - %%base
        %rotate 1
    %endrep
    %rotate 2
%%dy_2048:
    %xdefine %1_%2_dy2_table (%%dy_2048 - %3)
    %rep %0 - 2
        dw %%base %+ .dy2_w%3 - %%base
        %rotate 1
    %endrep
%endmacro

SCALED_JMP_TABLE put_8tap_scaled,  ssse3, 2, 4, 8, 16, 32, 64, 128
SCALED_JMP_TABLE prep_8tap_scaled, ssse3,    4, 8, 16, 32, 64, 128

%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX

SECTION .text

INIT_XMM ssse3

%if ARCH_X86_32
    DECLARE_REG_TMP 1
    %define base t0-put_ssse3
%else
    DECLARE_REG_TMP 7
    %define base 0
%endif

%macro RESTORE_DSQ_32 1
 %if ARCH_X86_32
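    ; note (added): on x86-32, t0 is r1, which is also dsq, so callers
    ; pass t0 here to reload the destination stride that lives in stack
    ; memory (dsm); on x86-64 this macro expands to nothing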
    mov %1, dsm ; restore dsq
 %endif
%endmacro

cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, h, mxy
    movifnidn mxyd, r6m ; mx
    LEA t0, put_ssse3
    movifnidn srcq, srcmp
    movifnidn ssq, ssmp
    tzcnt wd, wm
    mov hd, hm
    test mxyd, mxyd
    jnz .h
    mov mxyd, r7m ; my
    test mxyd, mxyd
    jnz .v
.put:
    movzx wd, word [t0+wq*2+table_offset(put,)]
    add wq, t0
    RESTORE_DSQ_32 t0
    jmp wq
.put_w2:
    movzx r4d, word [srcq+ssq*0]
    movzx r6d, word [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    mov [dstq+dsq*0], r4w
    mov [dstq+dsq*1], r6w
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .put_w2
    RET
.put_w4:
    mov r4d, [srcq+ssq*0]
    mov r6d, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    mov [dstq+dsq*0], r4d
    mov [dstq+dsq*1], r6d
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .put_w4
    RET
.put_w8:
    movq m0, [srcq+ssq*0]
    movq m1, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    movq [dstq+dsq*0], m0
    movq [dstq+dsq*1], m1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .put_w8
    RET
.put_w16:
    movu m0, [srcq+ssq*0]
    movu m1, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    mova [dstq+dsq*0], m0
    mova [dstq+dsq*1], m1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .put_w16
    RET
.put_w32:
    movu m0, [srcq+ssq*0+16*0]
    movu m1, [srcq+ssq*0+16*1]
    movu m2, [srcq+ssq*1+16*0]
    movu m3, [srcq+ssq*1+16*1]
    lea srcq, [srcq+ssq*2]
    mova [dstq+dsq*0+16*0], m0
    mova [dstq+dsq*0+16*1], m1
    mova [dstq+dsq*1+16*0], m2
    mova [dstq+dsq*1+16*1], m3
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .put_w32
    RET
.put_w64:
    movu m0, [srcq+16*0]
    movu m1, [srcq+16*1]
    movu m2, [srcq+16*2]
    movu m3, [srcq+16*3]
    add srcq, ssq
    mova [dstq+16*0], m0
    mova [dstq+16*1], m1
    mova [dstq+16*2], m2
    mova [dstq+16*3], m3
    add dstq, dsq
    dec hd
    jg .put_w64
    RET
.put_w128:
    movu m0, [srcq+16*0]
    movu m1, [srcq+16*1]
    movu m2, [srcq+16*2]
    movu m3, [srcq+16*3]
    mova [dstq+16*0], m0
    mova [dstq+16*1], m1
    mova [dstq+16*2], m2
    mova [dstq+16*3], m3
    movu m0, [srcq+16*4]
    movu m1, [srcq+16*5]
    movu m2, [srcq+16*6]
    movu m3, [srcq+16*7]
    mova [dstq+16*4], m0
    mova [dstq+16*5], m1
    mova [dstq+16*6], m2
    mova [dstq+16*7], m3
    add srcq, ssq
    add dstq, dsq
    dec hd
    jg .put_w128
    RET
.h:
    ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
    ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
    imul mxyd, 0x00ff00ff
    mova m4, [base+bilin_h_shuf8]
    mova m0, [base+bilin_h_shuf4]
    add mxyd, 0x00100010
    movd m5, mxyd
    mov mxyd, r7m ; my
    pshufd m5, m5, q0000
    test mxyd, mxyd
    jnz .hv
    movzx wd, word [t0+wq*2+table_offset(put, _bilin_h)]
    mova m3, [base+pw_2048]
    add wq, t0
    movifnidn dsq, dsmp
    jmp wq
.h_w2:
    pshufd m4, m4, q3120 ; m4 = {1, 0, 2, 1, 5, 4, 6, 5}
.h_w2_loop:
    movd m0, [srcq+ssq*0]
    movd m1, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    punpckldq m0, m1
    pshufb m0, m4
    pmaddubsw m0, m5
    pmulhrsw m0, m3
    packuswb m0, m0
    movd r6d, m0
    mov [dstq+dsq*0], r6w
    shr r6d, 16
    mov [dstq+dsq*1], r6w
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .h_w2_loop
    RET
.h_w4:
    movq m4, [srcq+ssq*0]
    movhps m4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    pshufb m4, m0
    pmaddubsw m4, m5
    pmulhrsw m4, m3
    packuswb m4, m4
    movd [dstq+dsq*0], m4
    psrlq m4, 32
    movd [dstq+dsq*1], m4
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .h_w4
    RET
.h_w8:
    movu m0, [srcq+ssq*0]
    movu m1, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    pshufb m0, m4
    pshufb m1, m4
    pmaddubsw m0, m5
    pmaddubsw m1, m5
    pmulhrsw m0, m3
    pmulhrsw m1, m3
    packuswb m0, m1
    movq [dstq+dsq*0], m0
    movhps [dstq+dsq*1], m0
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .h_w8
    RET
.h_w16:
    movu m0, [srcq+8*0]
    movu m1, [srcq+8*1]
    add srcq, ssq
    pshufb m0, m4
    pshufb m1, m4
    pmaddubsw m0, m5
    pmaddubsw m1, m5
    pmulhrsw m0, m3
    pmulhrsw m1, m3
    packuswb m0, m1
    mova [dstq], m0
    add dstq, dsq
    dec hd
    jg .h_w16
    RET
.h_w32:
    movu m0, [srcq+mmsize*0+8*0]
    movu m1, [srcq+mmsize*0+8*1]
    pshufb m0, m4
    pshufb m1, m4
    pmaddubsw m0, m5
    pmaddubsw m1, m5
    pmulhrsw m0, m3
    pmulhrsw m1, m3
    packuswb m0, m1
    movu m1, [srcq+mmsize*1+8*0]
    movu m2, [srcq+mmsize*1+8*1]
    add srcq, ssq
    pshufb m1, m4
    pshufb m2, m4
    pmaddubsw m1, m5
    pmaddubsw m2, m5
    pmulhrsw m1, m3
    pmulhrsw m2, m3
    packuswb m1, m2
    mova [dstq+16*0], m0
    mova [dstq+16*1], m1
    add dstq, dsq
    dec hd
    jg .h_w32
    RET
.h_w64:
    mov r6, -16*3
.h_w64_loop:
    movu m0, [srcq+r6+16*3+8*0]
    movu m1, [srcq+r6+16*3+8*1]
    pshufb m0, m4
    pshufb m1, m4
    pmaddubsw m0, m5
    pmaddubsw m1, m5
    pmulhrsw m0, m3
    pmulhrsw m1, m3
    packuswb m0, m1
    mova [dstq+r6+16*3], m0
    add r6, 16
    jle .h_w64_loop
    add srcq, ssq
    add dstq, dsq
    dec hd
    jg .h_w64
    RET
.h_w128:
    mov r6, -16*7
.h_w128_loop:
    movu m0, [srcq+r6+16*7+8*0]
    movu m1, [srcq+r6+16*7+8*1]
    pshufb m0, m4
    pshufb m1, m4
    pmaddubsw m0, m5
    pmaddubsw m1, m5
    pmulhrsw m0, m3
    pmulhrsw m1, m3
    packuswb m0, m1
    mova [dstq+r6+16*7], m0
    add r6, 16
    jle .h_w128_loop
    add srcq, ssq
    add dstq, dsq
    dec hd
    jg .h_w128
    RET
.v:
    movzx wd, word [t0+wq*2+table_offset(put, _bilin_v)]
    imul mxyd, 0x00ff00ff
    mova m5, [base+pw_2048]
    add mxyd, 0x00100010
    add wq, t0
    movd m4, mxyd
    pshufd m4, m4, q0000
    movifnidn dsq, dsmp
    jmp wq
.v_w2:
    movd m0, [srcq+ssq*0]
.v_w2_loop:
    pinsrw m0, [srcq+ssq*1], 1 ; 0 1
    lea srcq, [srcq+ssq*2]
    pshuflw m1, m0, q2301
    pinsrw m0, [srcq+ssq*0], 0 ; 2 1
    punpcklbw m1, m0
    pmaddubsw m1, m4
    pmulhrsw m1, m5
    packuswb m1, m1
    movd r6d, m1
    mov [dstq+dsq*1], r6w
    shr r6d, 16
    mov [dstq+dsq*0], r6w
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w2_loop
    RET
.v_w4:
    movd m0, [srcq+ssq*0]
.v_w4_loop:
    movd m2, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    mova m1, m0
    movd m0, [srcq+ssq*0]
    punpckldq m1, m2 ; 0 1
    punpckldq m2, m0 ; 1 2
    punpcklbw m1, m2
    pmaddubsw m1, m4
    pmulhrsw m1, m5
    packuswb m1, m1
    movd [dstq+dsq*0], m1
    psrlq m1, 32
    movd [dstq+dsq*1], m1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w4_loop
    RET
.v_w8:
    movq m0, [srcq+ssq*0]
.v_w8_loop:
    movq m2, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    mova m1, m0
    movq m0, [srcq+ssq*0]
    punpcklbw m1, m2
    punpcklbw m2, m0
    pmaddubsw m1, m4
    pmaddubsw m2, m4
    pmulhrsw m1, m5
    pmulhrsw m2, m5
    packuswb m1, m2
    movq [dstq+dsq*0], m1
    movhps [dstq+dsq*1], m1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w8_loop
    RET
%macro PUT_BILIN_V_W16 0
    movu m0, [srcq+ssq*0]
%%loop:
    movu m3, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    mova m1, m0
    mova m2, m0
    movu m0, [srcq+ssq*0]
    punpcklbw m1, m3
    punpckhbw m2, m3
    pmaddubsw m1, m4
    pmaddubsw m2, m4
    pmulhrsw m1, m5
    pmulhrsw m2, m5
    packuswb m1, m2
    punpcklbw m2, m3, m0
    punpckhbw m3, m0
    pmaddubsw m2, m4
    pmaddubsw m3, m4
    pmulhrsw m2, m5
    pmulhrsw m3, m5
    packuswb m2, m3
    mova [dstq+dsq*0], m1
    mova [dstq+dsq*1], m2
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg %%loop
%endmacro
.v_w16:
    PUT_BILIN_V_W16
    RET
.v_w128:
    lea r6d, [hq+(7<<16)]
    jmp .v_w16gt
.v_w64:
    lea r6d, [hq+(3<<16)]
    jmp .v_w16gt
.v_w32:
    lea r6d, [hq+(1<<16)]
.v_w16gt:
    mov r4, srcq
%if ARCH_X86_64
    mov r7, dstq
%endif
.v_w16gt_loop:
    PUT_BILIN_V_W16
%if ARCH_X86_64
    add r4, 16
    add r7, 16
    movzx hd, r6b
    mov srcq, r4
    mov dstq, r7
%else
    mov dstq, dstmp
    add r4, 16
    movzx hd, r6w
    add dstq, 16
    mov srcq, r4
    mov dstmp, dstq
%endif
    sub r6d, 1<<16
    jg .v_w16gt
    RET
.hv:
    ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
    ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
    movzx wd, word [t0+wq*2+table_offset(put, _bilin_hv)]
    WIN64_SPILL_XMM 8
    shl mxyd, 11 ; can't shift by 12 due to signed overflow
    mova m7, [base+pw_15]
    movd m6, mxyd
    add wq, t0
    pshuflw m6, m6, q0000
    paddb m5, m5
    punpcklqdq m6, m6
    jmp wq
.hv_w2:
    RESTORE_DSQ_32 t0
    movd m0, [srcq+ssq*0]
    punpckldq m0, m0
    pshufb m0, m4
    pmaddubsw m0, m5
.hv_w2_loop:
    movd m1, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    movd m2, [srcq+ssq*0]
    punpckldq m1, m2
    pshufb m1, m4
    pmaddubsw m1, m5 ; 1 _ 2 _
    shufps m2, m0, m1, q1032 ; 0 _ 1 _
    mova m0, m1
    psubw m1, m2 ; 2 * (src[x + src_stride] - src[x])
    pmulhw m1, m6 ; (my * (src[x + src_stride] - src[x])) >> 4
    pavgw m2, m7 ; src[x] + 8
    paddw m1, m2 ; src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8
    psrlw m1, 4
    packuswb m1, m1
%if ARCH_X86_64
    movq r6, m1
%else
    pshuflw m1, m1, q2020
    movd r6d, m1
%endif
    mov [dstq+dsq*0], r6w
    shr r6, gprsize*4
    mov [dstq+dsq*1], r6w
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w2_loop
    RET
.hv_w4:
    mova m4, [base+bilin_h_shuf4]
    movddup m0, [srcq+ssq*0]
    movifnidn dsq, dsmp
    pshufb m0, m4
    pmaddubsw m0, m5
.hv_w4_loop:
    movq m1, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    movhps m1, [srcq+ssq*0]
    pshufb m1, m4
    pmaddubsw m1, m5 ; 1 2
    shufps m2, m0, m1, q1032 ; 0 1
    mova m0, m1
    psubw m1, m2
    pmulhw m1, m6
    pavgw m2, m7
    paddw m1, m2
    psrlw m1, 4
    packuswb m1, m1
    movd [dstq+dsq*0], m1
    psrlq m1, 32
    movd [dstq+dsq*1], m1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w4_loop
    RET
.hv_w8:
    movu m0, [srcq+ssq*0]
    movifnidn dsq, dsmp
    pshufb m0, m4
    pmaddubsw m0, m5
.hv_w8_loop:
    movu m2, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    pshufb m2, m4
    pmaddubsw m2, m5
    psubw m1, m2, m0
    pmulhw m1, m6
    pavgw m0, m7
    paddw m1, m0
    movu m0, [srcq+ssq*0]
    pshufb m0, m4
    pmaddubsw m0, m5
    psubw m3, m0, m2
    pmulhw m3, m6
    pavgw m2, m7
    paddw m3, m2
    psrlw m1, 4
    psrlw m3, 4
    packuswb m1, m3
    movq [dstq+dsq*0], m1
    movhps [dstq+dsq*1], m1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w8_loop
    RET
.hv_w128:
    lea r6d, [hq+(7<<16)]
    jmp .hv_w16_start
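; Explanatory note (added): the put_bilin .hv_w* loops compute
;   dst = (A + ((my * (B - A)) >> 4) + 8) >> 4
; where A/B are horizontally filtered rows. Since the h coefficients
; were doubled (paddb m5, m5), pmulhw against m6 = my << 11 yields
; (2*(B-A) * (my << 11)) >> 16 = (my * (B - A)) >> 4, and pavgw with
; m7 = pw_15 gives (2*A + 15 + 1) >> 1 = A + 8, folding the rounding
; term in before the final psrlw by 4.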
.hv_w64:
    lea r6d, [hq+(3<<16)]
    jmp .hv_w16_start
.hv_w32:
    lea r6d, [hq+(1<<16)]
.hv_w16_start:
    mov r4, srcq
%if ARCH_X86_32
    %define m8 [dstq]
%else
    mov r7, dstq
%endif
.hv_w16:
    movifnidn dsq, dsmp
%if WIN64
    movaps r4m, m8
%endif
.hv_w16_loop0:
    movu m0, [srcq+8*0]
    movu m1, [srcq+8*1]
    pshufb m0, m4
    pshufb m1, m4
    pmaddubsw m0, m5
    pmaddubsw m1, m5
.hv_w16_loop:
    add srcq, ssq
    movu m2, [srcq+8*0]
    movu m3, [srcq+8*1]
    pshufb m2, m4
    pshufb m3, m4
    pmaddubsw m2, m5
    pmaddubsw m3, m5
    mova m8, m2
    psubw m2, m0
    pmulhw m2, m6
    pavgw m0, m7
    paddw m2, m0
    mova m0, m3
    psubw m3, m1
    pmulhw m3, m6
    pavgw m1, m7
    paddw m3, m1
    mova m1, m0
    mova m0, m8
    psrlw m2, 4
    psrlw m3, 4
    packuswb m2, m3
    mova [dstq], m2
    add dstq, dsmp
    dec hd
    jg .hv_w16_loop
%if ARCH_X86_32
    mov dstq, dstm
    add r4, 16
    movzx hd, r6w
    add dstq, 16
    mov srcq, r4
    mov dstm, dstq
%else
    add r4, 16
    add r7, 16
    movzx hd, r6b
    mov srcq, r4
    mov dstq, r7
%endif
    sub r6d, 1<<16
    jg .hv_w16_loop0
%if WIN64
    movaps m8, r4m
%endif
    RET

%macro PSHUFB_BILIN_H8 2 ; dst, src
 %if cpuflag(ssse3)
    pshufb %1, %2
 %else
    psrldq %2, %1, 1
    punpcklbw %1, %2
 %endif
%endmacro

%macro PSHUFB_BILIN_H4 3 ; dst, src, tmp
 %if cpuflag(ssse3)
    pshufb %1, %2
 %else
    psrldq %2, %1, 1
    punpckhbw %3, %1, %2
    punpcklbw %1, %2
    punpcklqdq %1, %3
 %endif
%endmacro

%macro PMADDUBSW 5 ; dst/src1, src2, zero, tmp, reset_zero
 %if cpuflag(ssse3)
    pmaddubsw %1, %2
 %else
  %if %5 == 1
    pxor %3, %3
  %endif
    punpckhbw %4, %1, %3
    punpcklbw %1, %1, %3
    pmaddwd %4, %2
    pmaddwd %1, %2
    packssdw %1, %4
 %endif
%endmacro

%macro PMULHRSW 5 ; dst, src, tmp, rndval, shift
 %if cpuflag(ssse3)
    pmulhrsw %1, %2
 %else
    punpckhwd %3, %1, %4
    punpcklwd %1, %4
    pmaddwd %3, %2
    pmaddwd %1, %2
    psrad %3, %5
    psrad %1, %5
    packssdw %1, %3
 %endif
%endmacro

%macro PREP_BILIN 0
%if ARCH_X86_32
    %define base r6-prep%+SUFFIX
%else
    %define base 0
%endif

cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
    movifnidn mxyd, r5m ; mx
    LEA r6, prep%+SUFFIX
    tzcnt wd, wm
    movifnidn hd, hm
    test mxyd, mxyd
    jnz .h
    mov mxyd, r6m ; my
    test mxyd, mxyd
    jnz .v
.prep:
%if notcpuflag(ssse3)
    add r6, prep_ssse3 - prep_sse2
    jmp prep_ssse3
%else
    movzx wd, word [r6+wq*2+table_offset(prep,)]
    pxor m4, m4
    add wq, r6
    lea stride3q, [strideq*3]
    jmp wq
.prep_w4:
    movd m0, [srcq+strideq*0]
    movd m1, [srcq+strideq*1]
    movd m2, [srcq+strideq*2]
    movd m3, [srcq+stride3q ]
    lea srcq, [srcq+strideq*4]
    punpckldq m0, m1
    punpckldq m2, m3
    punpcklbw m0, m4
    punpcklbw m2, m4
    psllw m0, 4
    psllw m2, 4
    mova [tmpq+16*0], m0
    mova [tmpq+16*1], m2
    add tmpq, 16*2
    sub hd, 4
    jg .prep_w4
    RET
.prep_w8:
    movq m0, [srcq+strideq*0]
    movq m1, [srcq+strideq*1]
    movq m2, [srcq+strideq*2]
    movq m3, [srcq+stride3q ]
    lea srcq, [srcq+strideq*4]
    punpcklbw m0, m4
    punpcklbw m1, m4
    punpcklbw m2, m4
    punpcklbw m3, m4
    psllw m0, 4
    psllw m1, 4
    psllw m2, 4
    psllw m3, 4
    mova [tmpq+16*0], m0
    mova [tmpq+16*1], m1
    mova [tmpq+16*2], m2
    mova [tmpq+16*3], m3
    add tmpq, 16*4
    sub hd, 4
    jg .prep_w8
    RET
.prep_w16:
    movu m1, [srcq+strideq*0]
    movu m3, [srcq+strideq*1]
    lea srcq, [srcq+strideq*2]
    punpcklbw m0, m1, m4
    punpckhbw m1, m4
    punpcklbw m2, m3, m4
    punpckhbw m3, m4
    psllw m0, 4
    psllw m1, 4
    psllw m2, 4
    psllw m3, 4
    mova [tmpq+16*0], m0
    mova [tmpq+16*1], m1
    mova [tmpq+16*2], m2
    mova [tmpq+16*3], m3
    add tmpq, 16*4
    sub hd, 2
    jg .prep_w16
    RET
.prep_w128:
    mov r3, -128
    jmp .prep_w32_start
.prep_w64:
    mov r3, -64
    jmp .prep_w32_start
.prep_w32:
    mov r3, -32
.prep_w32_start:
    sub srcq, r3
.prep_w32_vloop:
    mov r6, r3
.prep_w32_hloop:
    movu m1, [srcq+r6+16*0]
    movu m3, [srcq+r6+16*1]
    punpcklbw m0, m1, m4
    punpckhbw m1, m4
    punpcklbw m2, m3, m4
    punpckhbw m3, m4
    psllw m0, 4
    psllw m1, 4
    psllw m2, 4
    psllw m3, 4
    mova [tmpq+16*0], m0
    mova [tmpq+16*1], m1
    mova [tmpq+16*2], m2
    mova [tmpq+16*3], m3
    add tmpq, 16*4
    add r6, 32
    jl .prep_w32_hloop
    add srcq, strideq
    dec hd
    jg .prep_w32_vloop
    RET
%endif
.h:
    ; 16 * src[x] + (mx * (src[x + 1] - src[x]))
    ; = (16 - mx) * src[x] + mx * src[x + 1]
%if cpuflag(ssse3)
    imul mxyd, 0x00ff00ff
    mova m4, [base+bilin_h_shuf8]
    add mxyd, 0x00100010
%else
    imul mxyd, 0xffff
    add mxyd, 16
%endif
    movd m5, mxyd
    mov mxyd, r6m ; my
    pshufd m5, m5, q0000
    test mxyd, mxyd
    jnz .hv
    movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)]
%if notcpuflag(ssse3)
    WIN64_SPILL_XMM 8
    pxor m6, m6
%endif
    add wq, r6
    jmp wq
.h_w4:
%if cpuflag(ssse3)
    mova m4, [base+bilin_h_shuf4]
%endif
    lea stride3q, [strideq*3]
.h_w4_loop:
    movq m0, [srcq+strideq*0]
    movhps m0, [srcq+strideq*1]
    movq m1, [srcq+strideq*2]
    movhps m1, [srcq+stride3q ]
    lea srcq, [srcq+strideq*4]
    PSHUFB_BILIN_H4 m0, m4, m2
    PMADDUBSW m0, m5, m6, m2, 0
    PSHUFB_BILIN_H4 m1, m4, m2
    PMADDUBSW m1, m5, m6, m2, 0
    mova [tmpq+0 ], m0
    mova [tmpq+16], m1
    add tmpq, 32
    sub hd, 4
    jg .h_w4_loop
    RET
.h_w8:
    lea stride3q, [strideq*3]
.h_w8_loop:
    movu m0, [srcq+strideq*0]
    movu m1, [srcq+strideq*1]
    movu m2, [srcq+strideq*2]
    movu m3, [srcq+stride3q ]
    lea srcq, [srcq+strideq*4]
    PSHUFB_BILIN_H8 m0, m4
    PSHUFB_BILIN_H8 m1, m4
    PSHUFB_BILIN_H8 m2, m4
    PSHUFB_BILIN_H8 m3, m4
    PMADDUBSW m0, m5, m6, m7, 0
    PMADDUBSW m1, m5, m6, m7, 0
    PMADDUBSW m2, m5, m6, m7, 0
    PMADDUBSW m3, m5, m6, m7, 0
    mova [tmpq+16*0], m0
    mova [tmpq+16*1], m1
    mova [tmpq+16*2], m2
    mova [tmpq+16*3], m3
    add tmpq, 16*4
    sub hd, 4
    jg .h_w8_loop
    RET
.h_w16:
    movu m0, [srcq+strideq*0+8*0]
    movu m1, [srcq+strideq*0+8*1]
    movu m2, [srcq+strideq*1+8*0]
    movu m3, [srcq+strideq*1+8*1]
    lea srcq, [srcq+strideq*2]
    PSHUFB_BILIN_H8 m0, m4
    PSHUFB_BILIN_H8 m1, m4
    PSHUFB_BILIN_H8 m2, m4
    PSHUFB_BILIN_H8 m3, m4
    PMADDUBSW m0, m5, m6, m7, 0
    PMADDUBSW m1, m5, m6, m7, 0
    PMADDUBSW m2, m5, m6, m7, 0
    PMADDUBSW m3, m5, m6, m7, 0
    mova [tmpq+16*0], m0
    mova [tmpq+16*1], m1
    mova [tmpq+16*2], m2
    mova [tmpq+16*3], m3
    add tmpq, 16*4
    sub hd, 2
    jg .h_w16
    RET
.h_w128:
    mov r3, -128
    jmp .h_w32_start
.h_w64:
    mov r3, -64
    jmp .h_w32_start
.h_w32:
    mov r3, -32
.h_w32_start:
    sub srcq, r3
.h_w32_vloop:
    mov r6, r3
.h_w32_hloop:
    movu m0, [srcq+r6+8*0]
    movu m1, [srcq+r6+8*1]
    movu m2, [srcq+r6+8*2]
    movu m3, [srcq+r6+8*3]
    PSHUFB_BILIN_H8 m0, m4
    PSHUFB_BILIN_H8 m1, m4
    PSHUFB_BILIN_H8 m2, m4
    PSHUFB_BILIN_H8 m3, m4
    PMADDUBSW m0, m5, m6, m7, 0
    PMADDUBSW m1, m5, m6, m7, 0
    PMADDUBSW m2, m5, m6, m7, 0
    PMADDUBSW m3, m5, m6, m7, 0
    mova [tmpq+16*0], m0
    mova [tmpq+16*1], m1
    mova [tmpq+16*2], m2
    mova [tmpq+16*3], m3
    add tmpq, 16*4
    add r6, 32
    jl .h_w32_hloop
    add srcq, strideq
    dec hd
    jg .h_w32_vloop
    RET
.v:
%if notcpuflag(ssse3)
    WIN64_SPILL_XMM 8
%endif
    movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)]
%if cpuflag(ssse3)
    imul mxyd, 0x00ff00ff
    add mxyd, 0x00100010
%else
    imul mxyd, 0xffff
    pxor m6, m6
    add mxyd, 16
%endif
    add wq, r6
    lea stride3q, [strideq*3]
    movd m5, mxyd
    pshufd m5, m5, q0000
    jmp wq
.v_w4:
    movd m0, [srcq+strideq*0]
.v_w4_loop:
    movd m1, [srcq+strideq*1]
    movd m2, [srcq+strideq*2]
    movd m3, [srcq+stride3q ]
    lea srcq, [srcq+strideq*4]
    punpckldq m0, m1
    punpckldq m1, m2
    punpcklbw m0, m1 ; 01 12
    PMADDUBSW m0, m5, m6, m7, 0
    mova [tmpq+16*0], m0
    movd m0, [srcq+strideq*0]
    punpckldq m2, m3
    punpckldq m3, m0
    punpcklbw m2, m3 ; 23 34
    PMADDUBSW m2, m5, m6, m7, 0
    mova [tmpq+16*1], m2
    add tmpq, 16*2
    sub hd, 4
    jg .v_w4_loop
    RET
.v_w8:
    movq m0, [srcq+strideq*0]
.v_w8_loop:
    movq m1, [srcq+strideq*1]
    movq m2, [srcq+strideq*2]
    movq m3, [srcq+stride3q ]
    lea srcq, [srcq+strideq*4]
    punpcklbw m0, m1 ; 01
    punpcklbw m1, m2 ; 12
    PMADDUBSW m0, m5, m6, m7, 0
    PMADDUBSW m1, m5, m6, m7, 0
    mova [tmpq+16*0], m0
    movq m0, [srcq+strideq*0]
    punpcklbw m2, m3 ; 23
    punpcklbw m3, m0 ; 34
    PMADDUBSW m2, m5, m6, m7, 0
    mova [tmpq+16*1], m1
    PMADDUBSW m3, m5, m6, m7, 0
    mova [tmpq+16*2], m2
    mova [tmpq+16*3], m3
    add tmpq, 16*4
    sub hd, 4
    jg .v_w8_loop
    RET
.v_w16:
    movu m0, [srcq+strideq*0]
.v_w16_loop:
    movu m1, [srcq+strideq*1]
    movu m2, [srcq+strideq*2]
    movu m3, [srcq+stride3q ]
    lea srcq, [srcq+strideq*4]
    punpcklbw m4, m0, m1
    punpckhbw m0, m1
    PMADDUBSW m4, m5, m6, m7, 0
    PMADDUBSW m0, m5, m6, m7, 0
    mova [tmpq+16*0], m4
    punpcklbw m4, m1, m2
    punpckhbw m1, m2
    PMADDUBSW m4, m5, m6, m7, 0
    mova [tmpq+16*1], m0
    movu m0, [srcq+strideq*0]
    PMADDUBSW m1, m5, m6, m7, 0
    mova [tmpq+16*2], m4
    punpcklbw m4, m2, m3
    punpckhbw m2, m3
    PMADDUBSW m4, m5, m6, m7, 0
    mova [tmpq+16*3], m1
    PMADDUBSW m2, m5, m6, m7, 0
    mova [tmpq+16*4], m4
    punpcklbw m4, m3, m0
    punpckhbw m3, m0
    PMADDUBSW m4, m5, m6, m7, 0
    mova [tmpq+16*5], m2
    PMADDUBSW m3, m5, m6, m7, 0
    mova [tmpq+16*6], m4
    mova [tmpq+16*7], m3
    add tmpq, 16*8
    sub hd, 4
    jg .v_w16_loop
    RET
.v_w128:
    lea r3d, [hq+(3<<8)]
    mov r6d, 256
    jmp .v_w32_start
.v_w64:
    lea r3d, [hq+(1<<8)]
    mov r6d, 128
    jmp .v_w32_start
.v_w32:
    xor r3d, r3d
    mov r6d, 64
.v_w32_start:
%if ARCH_X86_64
 %if WIN64
    PUSH r7
 %endif
    mov r7, tmpq
%endif
    mov r5, srcq
.v_w32_hloop:
    movu m0, [srcq+strideq*0+16*0]
    movu m1, [srcq+strideq*0+16*1]
.v_w32_vloop:
    movu m2, [srcq+strideq*1+16*0]
    movu m3, [srcq+strideq*1+16*1]
    lea srcq, [srcq+strideq*2]
    punpcklbw m4, m0, m2
    punpckhbw m0, m2
    PMADDUBSW m4, m5, m6, m7, 0
    PMADDUBSW m0, m5, m6, m7, 0
    mova [tmpq+16*0], m4
    mova [tmpq+16*1], m0
    movu m0, [srcq+strideq*0+16*0]
    punpcklbw m4, m1, m3
    punpckhbw m1, m3
    PMADDUBSW m4, m5, m6, m7, 0
    PMADDUBSW m1, m5, m6, m7, 0
    mova [tmpq+16*2], m4
    mova [tmpq+16*3], m1
    movu m1, [srcq+strideq*0+16*1]
    add tmpq, r6
    punpcklbw m4, m2, m0
    punpckhbw m2, m0
    PMADDUBSW m4, m5, m6, m7, 0
    PMADDUBSW m2, m5, m6, m7, 0
    mova [tmpq+16*0], m4
    mova [tmpq+16*1], m2
    punpcklbw m4, m3, m1
    punpckhbw m3, m1
    PMADDUBSW m4, m5, m6, m7, 0
    PMADDUBSW m3, m5, m6, m7, 0
    mova [tmpq+16*2], m4
    mova [tmpq+16*3], m3
    add tmpq, r6
    sub hd, 2
    jg .v_w32_vloop
    add r5, 32
    movzx hd, r3b
    mov srcq, r5
%if ARCH_X86_64
    add r7, 16*4
    mov tmpq, r7
%else
    mov tmpq, tmpmp
    add tmpq, 16*4
    mov tmpmp, tmpq
%endif
    sub r3d, 1<<8
    jg .v_w32_hloop
%if WIN64
    POP r7
%endif
    RET
.hv:
    ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
    ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
    movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)]
%if cpuflag(ssse3)
    imul mxyd, 0x08000800
    WIN64_SPILL_XMM 8
%else
    or mxyd, 1<<16
    WIN64_SPILL_XMM 9
 %if ARCH_X86_64
    mova m8, [base+pw_8]
 %else
    %define m8 [base+pw_8]
 %endif
    pxor m7, m7
%endif
    movd m6, mxyd
    add wq, r6
    pshufd m6, m6, q0000
    jmp wq
.hv_w4:
%if cpuflag(ssse3)
    mova m4, [base+bilin_h_shuf4]
    movddup m0, [srcq+strideq*0]
%else
    movhps m0, [srcq+strideq*0]
%endif
    lea r3, [strideq*3]
    PSHUFB_BILIN_H4 m0, m4, m3
    PMADDUBSW m0, m5, m7, m4, 0 ; _ 0
.hv_w4_loop:
    movq m1, [srcq+strideq*1]
    movhps m1, [srcq+strideq*2]
    movq m2, [srcq+r3]
    lea srcq, [srcq+strideq*4]
    movhps m2, [srcq+strideq*0]
    PSHUFB_BILIN_H4 m1, m4, m3
    PSHUFB_BILIN_H4 m2, m4, m3
    PMADDUBSW m1, m5, m7, m4, 0 ; 1 2
    PMADDUBSW m2, m5, m7, m4, 0 ; 3 4
    shufpd m0, m1, 0x01 ; 0 1
    shufpd m3, m1, m2, 0x01 ; 2 3
    psubw m1, m0
    PMULHRSW m1, m6, m4, m8, 4
    paddw m1, m0
    mova m0, m2
    psubw m2, m3
    PMULHRSW m2, m6, m4, m8, 4
    paddw m2, m3
    mova [tmpq+16*0], m1
    mova [tmpq+16*1], m2
    add tmpq, 32
    sub hd, 4
    jg .hv_w4_loop
    RET
.hv_w8:
    movu m0, [srcq+strideq*0]
    PSHUFB_BILIN_H8 m0, m4
    PMADDUBSW m0, m5, m7, m4, 0 ; 0
.hv_w8_loop:
    movu m1, [srcq+strideq*1]
    lea srcq, [srcq+strideq*2]
    movu m2, [srcq+strideq*0]
    PSHUFB_BILIN_H8 m1, m4
    PSHUFB_BILIN_H8 m2, m4
    PMADDUBSW m1, m5, m7, m4, 0 ; 1
    PMADDUBSW m2, m5, m7, m4, 0 ; 2
    psubw m3, m1, m0
    PMULHRSW m3, m6, m4, m8, 4
    paddw m3, m0
    mova m0, m2
    psubw m2, m1
    PMULHRSW m2, m6, m4, m8, 4
    paddw m2, m1
    mova [tmpq+16*0], m3
    mova [tmpq+16*1], m2
    add tmpq, 16*2
    sub hd, 2
    jg .hv_w8_loop
    RET
.hv_w128:
    lea r3d, [hq+(7<<8)]
    mov r5d, 256
    jmp .hv_w16_start
.hv_w64:
    lea r3d, [hq+(3<<8)]
    mov r5d, 128
    jmp .hv_w16_start
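; Note (added): in the prep .hv_w* blocks below, r5d is the byte stride
; of one output row in tmpq (2 bytes per pixel: 32/64/128/256 for
; w16/w32/w64/w128), and r3d packs the number of 16-pixel columns minus
; one above the height; .hv_w16 zeroes it so the column loop runs once.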
.hv_w32:
    lea r3d, [hq+(1<<8)]
    mov r5d, 64
    jmp .hv_w16_start
.hv_w16:
    xor r3d, r3d
    mov r5d, 32
.hv_w16_start:
%if ARCH_X86_64 || cpuflag(ssse3)
    mov r6, srcq
%endif
%if ARCH_X86_64
 %if WIN64
    PUSH r7
 %endif
    mov r7, tmpq
%endif
.hv_w16_hloop:
    movu m0, [srcq+strideq*0+8*0]
    movu m1, [srcq+strideq*0+8*1]
    PSHUFB_BILIN_H8 m0, m4
    PSHUFB_BILIN_H8 m1, m4
    PMADDUBSW m0, m5, m7, m4, 0 ; 0a
    PMADDUBSW m1, m5, m7, m4, 0 ; 0b
.hv_w16_vloop:
    movu m2, [srcq+strideq*1+8*0]
    PSHUFB_BILIN_H8 m2, m4
    PMADDUBSW m2, m5, m7, m4, 0 ; 1a
    psubw m3, m2, m0
    PMULHRSW m3, m6, m4, m8, 4
    paddw m3, m0
    mova [tmpq+16*0], m3
    movu m3, [srcq+strideq*1+8*1]
    lea srcq, [srcq+strideq*2]
    PSHUFB_BILIN_H8 m3, m4
    PMADDUBSW m3, m5, m7, m4, 0 ; 1b
    psubw m0, m3, m1
    PMULHRSW m0, m6, m4, m8, 4
    paddw m0, m1
    mova [tmpq+16*1], m0
    add tmpq, r5
    movu m0, [srcq+strideq*0+8*0]
    PSHUFB_BILIN_H8 m0, m4
    PMADDUBSW m0, m5, m7, m4, 0 ; 2a
    psubw m1, m0, m2
    PMULHRSW m1, m6, m4, m8, 4
    paddw m1, m2
    mova [tmpq+16*0], m1
    movu m1, [srcq+strideq*0+8*1]
    PSHUFB_BILIN_H8 m1, m4
    PMADDUBSW m1, m5, m7, m4, 0 ; 2b
    psubw m2, m1, m3
    PMULHRSW m2, m6, m4, m8, 4
    paddw m2, m3
    mova [tmpq+16*1], m2
    add tmpq, r5
    sub hd, 2
    jg .hv_w16_vloop
    movzx hd, r3b
%if ARCH_X86_64
    add r6, 16
    add r7, 2*16
    mov srcq, r6
    mov tmpq, r7
%elif cpuflag(ssse3)
    mov tmpq, tmpm
    add r6, 16
    add tmpq, 2*16
    mov srcq, r6
    mov tmpm, tmpq
%else
    mov srcq, srcm
    mov tmpq, tmpm
    add srcq, 16
    add tmpq, 2*16
    mov srcm, srcq
    mov tmpm, tmpq
%endif
    sub r3d, 1<<8
    jg .hv_w16_hloop
%if WIN64
    POP r7
%endif
    RET
%endmacro

; int8_t subpel_filters[5][15][8]
%assign FILTER_REGULAR (0*15 << 16) | 3*15
%assign FILTER_SMOOTH  (1*15 << 16) | 4*15
%assign FILTER_SHARP   (2*15 << 16) | 3*15

%macro FN 4 ; prefix, type, type_h, type_v
cglobal %1_%2_8bpc
    mov t0d, FILTER_%3
%ifidn %3, %4
    mov t1d, t0d
%else
    mov t1d, FILTER_%4
%endif
%ifnidn %2, regular ; skip the jump in the last filter
    jmp mangle(private_prefix %+ _%1_8bpc %+ SUFFIX)
%endif
%endmacro

%if ARCH_X86_32
DECLARE_REG_TMP 1, 2
%elif WIN64
DECLARE_REG_TMP 4, 5
%else
DECLARE_REG_TMP 7, 8
%endif

FN put_8tap, sharp,          SHARP,   SHARP
FN put_8tap, sharp_smooth,   SHARP,   SMOOTH
FN put_8tap, smooth_sharp,   SMOOTH,  SHARP
FN put_8tap, smooth,         SMOOTH,  SMOOTH
FN put_8tap, sharp_regular,  SHARP,   REGULAR
FN put_8tap, regular_sharp,  REGULAR, SHARP
FN put_8tap, smooth_regular, SMOOTH,  REGULAR
FN put_8tap, regular_smooth, REGULAR, SMOOTH
FN put_8tap, regular,        REGULAR, REGULAR

%if ARCH_X86_32
    %define base_reg r1
    %define base base_reg-put_ssse3
%else
    %define base_reg r8
    %define base 0
%endif

cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
    imul mxd, mxm, 0x010101
    add mxd, t0d ; 8tap_h, mx, 4tap_h
%if ARCH_X86_64
    imul myd, mym, 0x010101
    add myd, t1d ; 8tap_v, my, 4tap_v
%else
    imul ssd, mym, 0x010101
    add ssd, t1d ; 8tap_v, my, 4tap_v
    mov srcq, srcm
%endif
    mov wd, wm
    movifnidn hd, hm
    LEA base_reg, put_ssse3
    test mxd, 0xf00
    jnz .h
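; Note (added): the imul by 0x010101 above replicates the 4-bit subpel
; fraction into bytes 0-2 of mxd/myd; adding FILTER_* (8-tap filter
; offset in the upper half, 4-tap offset in the low byte) turns both
; into ready-made dav1d_mc_subpel_filters indices. The 0xf00 tests
; check the replicated copy in bits 8-11, i.e. whether the fraction is
; nonzero at all.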
%if ARCH_X86_32
    test ssd, 0xf00
%else
    test myd, 0xf00
%endif
    jnz .v
    tzcnt wd, wd
    movzx wd, word [base_reg+wq*2+table_offset(put,)]
    add wq, base_reg
; put_bilin mangling jump
    movifnidn dsq, dsmp
    movifnidn ssq, ssmp
%if WIN64
    pop r8
%endif
    lea r6, [ssq*3]
    jmp wq
.h:
%if ARCH_X86_32
    test ssd, 0xf00
%else
    test myd, 0xf00
%endif
    jnz .hv
    movifnidn ssq, ssmp
    WIN64_SPILL_XMM 12
    cmp wd, 4
    jl .h_w2
    je .h_w4
    tzcnt wd, wd
%if ARCH_X86_64
    mova m10, [base+subpel_h_shufA]
    mova m11, [base+subpel_h_shufB]
    mova m9, [base+subpel_h_shufC]
%endif
    shr mxd, 16
    sub srcq, 3
    movzx wd, word [base_reg+wq*2+table_offset(put, _8tap_h)]
    movq m6, [base_reg+mxq*8+subpel_filters-put_ssse3]
    mova m7, [base+pw_34] ; 2 + (8 << 2)
    pshufd m5, m6, q0000
    pshufd m6, m6, q1111
    add wq, base_reg
    jmp wq
.h_w2:
%if ARCH_X86_32
    and mxd, 0x7f
%else
    movzx mxd, mxb
%endif
    dec srcq
    mova m4, [base+subpel_h_shuf4]
    movd m3, [base_reg+mxq*8+subpel_filters-put_ssse3+2]
    mova m5, [base+pw_34] ; 2 + (8 << 2)
    pshufd m3, m3, q0000
    movifnidn dsq, dsmp
.h_w2_loop:
    movq m0, [srcq+ssq*0]
    movhps m0, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    pshufb m0, m4
    pmaddubsw m0, m3
    phaddw m0, m0
    paddw m0, m5 ; pw34
    psraw m0, 6
    packuswb m0, m0
    movd r6d, m0
    mov [dstq+dsq*0], r6w
    shr r6d, 16
    mov [dstq+dsq*1], r6w
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .h_w2_loop
    RET
.h_w4:
%if ARCH_X86_32
    and mxd, 0x7f
%else
    movzx mxd, mxb
%endif
    dec srcq
    movd m3, [base_reg+mxq*8+subpel_filters-put_ssse3+2]
    mova m6, [base+subpel_h_shufA]
    mova m5, [base+pw_34] ; 2 + (8 << 2)
    pshufd m3, m3, q0000
    movifnidn dsq, dsmp
.h_w4_loop:
    movq m0, [srcq+ssq*0] ; 1
    movq m1, [srcq+ssq*1] ; 2
    lea srcq, [srcq+ssq*2]
    pshufb m0, m6 ; subpel_h_shufA
    pshufb m1, m6 ; subpel_h_shufA
    pmaddubsw m0, m3 ; subpel_filters
    pmaddubsw m1, m3 ; subpel_filters
    phaddw m0, m1
    paddw m0, m5 ; pw34
    psraw m0, 6
    packuswb m0, m0
    movd [dstq+dsq*0], m0
    psrlq m0, 32
    movd [dstq+dsq*1], m0
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .h_w4_loop
    RET
%macro PUT_8TAP_H 4 ; dst/src, tmp[1-3]
 %if ARCH_X86_32
    pshufb %2, %1, [base+subpel_h_shufB]
    pshufb %3, %1, [base+subpel_h_shufC]
    pshufb %1, [base+subpel_h_shufA]
 %else
    pshufb %2, %1, m11 ; subpel_h_shufB
    pshufb %3, %1, m9  ; subpel_h_shufC
    pshufb %1, m10     ; subpel_h_shufA
 %endif
    pmaddubsw %4, %2, m5 ; subpel +0 B0
    pmaddubsw %2, m6     ; subpel +4 B4
    pmaddubsw %3, m6     ; C4
    pmaddubsw %1, m5     ; A0
    paddw %3, %4         ; C4+B0
    paddw %1, %2         ; A0+B4
    phaddw %1, %3
    paddw %1, m7 ; pw34
    psraw %1, 6
%endmacro
.h_w8:
    movu m0, [srcq+ssq*0]
    movu m1, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    PUT_8TAP_H m0, m2, m3, m4
    PUT_8TAP_H m1, m2, m3, m4
    packuswb m0, m1
%if ARCH_X86_32
    movq [dstq], m0
    add dstq, dsm
    movhps [dstq], m0
    add dstq, dsm
%else
    movq [dstq+dsq*0], m0
    movhps [dstq+dsq*1], m0
    lea dstq, [dstq+dsq*2]
%endif
    sub hd, 2
    jg .h_w8
    RET
.h_w128:
    mov r4, -16*7
    jmp .h_w16_start
.h_w64:
    mov r4, -16*3
    jmp .h_w16_start
.h_w32:
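    ; note (added): r4 = 16-width (0 for w16); srcq/dstq are biased
    ; forward once in .h_w16_start so the column offset r6 can count
    ; up from r4 to 0 in 16-byte steps in .h_w16_loop_h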
    mov r4, -16*1
    jmp .h_w16_start
.h_w16:
    xor r4d, r4d
.h_w16_start:
    sub srcq, r4
    sub dstq, r4
.h_w16_loop_v:
    mov r6, r4
.h_w16_loop_h:
    movu m0, [srcq+r6+8*0]
    movu m1, [srcq+r6+8*1]
    PUT_8TAP_H m0, m2, m3, m4
    PUT_8TAP_H m1, m2, m3, m4
    packuswb m0, m1
    mova [dstq+r6], m0
    add r6, 16
    jle .h_w16_loop_h
    add srcq, ssq
    add dstq, dsmp
    dec hd
    jg .h_w16_loop_v
    RET
.v:
%if ARCH_X86_32
    movzx mxd, ssb
    shr ssd, 16
    cmp hd, 6
    cmovs ssd, mxd
    movq m0, [base_reg+ssq*8+subpel_filters-put_ssse3]
%else
    WIN64_SPILL_XMM 16
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovs myd, mxd
    movq m0, [base_reg+myq*8+subpel_filters-put_ssse3]
%endif
    tzcnt r6d, wd
    movzx r6d, word [base_reg+r6*2+table_offset(put, _8tap_v)]
    punpcklwd m0, m0
    mova m7, [base+pw_512]
    add r6, base_reg
%if ARCH_X86_32
    %define subpel0 [rsp+mmsize*0]
    %define subpel1 [rsp+mmsize*1]
    %define subpel2 [rsp+mmsize*2]
    %define subpel3 [rsp+mmsize*3]
 %assign regs_used 2 ; use r1 (ds) as tmp for stack alignment if needed
    ALLOC_STACK -16*4
 %assign regs_used 7
    pshufd m1, m0, q0000
    mova subpel0, m1
    pshufd m1, m0, q1111
    mova subpel1, m1
    pshufd m1, m0, q2222
    mova subpel2, m1
    pshufd m1, m0, q3333
    mova subpel3, m1
    mov ssq, [rstk+stack_offset+gprsize*4]
    lea ssq, [ssq*3]
    sub srcq, ssq
    mov ssq, [rstk+stack_offset+gprsize*4]
    mov dsq, [rstk+stack_offset+gprsize*2]
%else
    %define subpel0 m8
    %define subpel1 m9
    %define subpel2 m10
    %define subpel3 m11
    lea ss3q, [ssq*3]
    pshufd m8, m0, q0000
    sub srcq, ss3q
    pshufd m9, m0, q1111
    pshufd m10, m0, q2222
    pshufd m11, m0, q3333
%endif
    jmp r6
.v_w2:
    movd m1, [srcq+ssq*0]
    movd m0, [srcq+ssq*1]
%if ARCH_X86_32
    lea srcq, [srcq+ssq*2]
    movd m2, [srcq+ssq*0]
    movd m5, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    movd m3, [srcq+ssq*0]
    movd m4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
%else
    movd m2, [srcq+ssq*2]
    add srcq, ss3q
    movd m5, [srcq+ssq*0]
    movd m3, [srcq+ssq*1]
    movd m4, [srcq+ssq*2]
    add srcq, ss3q
%endif
    punpcklwd m1, m0 ; 0 1
    punpcklwd m0, m2 ; 1 2
    punpcklbw m1, m0 ; 01 12
    movd m0, [srcq+ssq*0]
    punpcklwd m2, m5 ; 2 3
    punpcklwd m5, m3 ; 3 4
    punpcklwd m3, m4 ; 4 5
    punpcklwd m4, m0 ; 5 6
    punpcklbw m2, m5 ; 23 34
    punpcklbw m3, m4 ; 45 56
.v_w2_loop:
    movd m4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    pmaddubsw m5, m1, subpel0 ; a0 b0
    mova m1, m2
    pmaddubsw m2, subpel1 ; a1 b1
    paddw m5, m2
    mova m2, m3
    pmaddubsw m3, subpel2 ; a2 b2
    paddw m5, m3
    punpcklwd m3, m0, m4 ; 6 7
    movd m0, [srcq+ssq*0]
    punpcklwd m4, m0 ; 7 8
    punpcklbw m3, m4 ; 67 78
    pmaddubsw m4, m3, subpel3 ; a3 b3
    paddw m5, m4
    pmulhrsw m5, m7
    packuswb m5, m5
    movd r6d, m5
    mov [dstq+dsq*0], r6w
    shr r6d, 16
    mov [dstq+dsq*1], r6w
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w2_loop
    RET
.v_w4:
%if ARCH_X86_32
.v_w8:
.v_w16:
.v_w32:
.v_w64:
.v_w128:
    shl wd, 14
%if STACK_ALIGNMENT < 16
    %define dstm [rsp+mmsize*4+gprsize]
    mov dstm, dstq
%endif
    lea r6d, [hq+wq-(1<<16)]
    mov r4, srcq
.v_w4_loop0:
%endif
    movd m1, [srcq+ssq*0]
    movd m0, [srcq+ssq*1]
%if ARCH_X86_32
    lea srcq, [srcq+ssq*2]
    movd m2, [srcq+ssq*0]
    movd m5, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    movd m3, [srcq+ssq*0]
    movd m4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
%else
    movd m2, [srcq+ssq*2]
    add srcq, ss3q
    movd m5, [srcq+ssq*0]
    movd m3, [srcq+ssq*1]
    movd m4, [srcq+ssq*2]
    add srcq, ss3q
%endif
    punpckldq m1, m0 ; 0 1
    punpckldq m0, m2 ; 1 2
    punpcklbw m1, m0 ; 01 12
    movd m0, [srcq+ssq*0]
    punpckldq m2, m5 ; 2 3
    punpckldq m5, m3 ; 3 4
    punpckldq m3, m4 ; 4 5
    punpckldq m4, m0 ; 5 6
    punpcklbw m2, m5 ; 23 34
    punpcklbw m3, m4 ; 45 56
.v_w4_loop:
    movd m4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    pmaddubsw m5, m1, subpel0 ; a0 b0
    mova m1, m2
    pmaddubsw m2, subpel1 ; a1 b1
    paddw m5, m2
    mova m2, m3
    pmaddubsw m3, subpel2 ; a2 b2
    paddw m5, m3
    punpckldq m3, m0, m4 ; 6 7 _ _
    movd m0, [srcq+ssq*0]
    punpckldq m4, m0 ; 7 8 _ _
    punpcklbw m3, m4 ; 67 78
    pmaddubsw m4, m3, subpel3 ; a3 b3
    paddw m5, m4
    pmulhrsw m5, m7
    packuswb m5, m5
    movd [dstq+dsq*0], m5
    psrlq m5, 32
    movd [dstq+dsq*1], m5
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w4_loop
%if ARCH_X86_32
    mov dstq, dstm
    add r4, 4
    movzx hd, r6w
    add dstq, 4
    mov srcq, r4
    mov dstm, dstq
    sub r6d, 1<<16
    jg .v_w4_loop0
%endif
    RET
%if ARCH_X86_64
.v_w8:
.v_w16:
.v_w32:
.v_w64:
.v_w128:
    lea r6d, [wq*8-64]
    mov r4, srcq
    mov r7, dstq
    lea r6d, [hq+r6*4]
.v_w8_loop0:
    movq m1, [srcq+ssq*0]
    movq m2, [srcq+ssq*1]
    movq m3, [srcq+ssq*2]
    add srcq, ss3q
    movq m4, [srcq+ssq*0]
    movq m5, [srcq+ssq*1]
    movq m6, [srcq+ssq*2]
    add srcq, ss3q
    movq m0, [srcq+ssq*0]
    punpcklbw m1, m2 ; 01
    punpcklbw m2, m3 ; 12
    punpcklbw m3, m4 ; 23
    punpcklbw m4, m5 ; 34
    punpcklbw m5, m6 ; 45
    punpcklbw m6, m0 ; 56
.v_w8_loop:
    movq m13, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    pmaddubsw m14, m1, subpel0 ; a0
    mova m1, m3
    pmaddubsw m15, m2, subpel0 ; b0
    mova m2, m4
    pmaddubsw m3, subpel1 ; a1
    mova m12, m0
    pmaddubsw m4, subpel1 ; b1
    movq m0, [srcq+ssq*0]
    paddw m14, m3
    paddw m15, m4
    mova m3, m5
    pmaddubsw m5, subpel2 ; a2
    mova m4, m6
    pmaddubsw m6, subpel2 ; b2
    punpcklbw m12, m13 ; 67
    punpcklbw m13, m0 ; 78
    paddw m14, m5
    mova m5, m12
    pmaddubsw m12, subpel3 ; a3
    paddw m15, m6
    mova m6, m13
    pmaddubsw m13, subpel3 ; b3
    paddw m14, m12
    paddw m15, m13
    pmulhrsw m14, m7
    pmulhrsw m15, m7
    packuswb m14, m15
    movq [dstq+dsq*0], m14
    movhps [dstq+dsq*1], m14
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w8_loop
    add r4, 8
    add r7, 8
    movzx hd, r6b
    mov srcq, r4
    mov dstq, r7
    sub r6d, 1<<8
    jg .v_w8_loop0
    RET
%endif ;ARCH_X86_64
%undef subpel0
%undef subpel1
%undef subpel2
%undef subpel3
.hv:
    RESET_STACK_STATE
    cmp wd, 4
    jg .hv_w8
%if ARCH_X86_32
    and mxd, 0x7f
%else
    movzx mxd, mxb
%endif
    dec srcq
    movd m1, [base_reg+mxq*8+subpel_filters-put_ssse3+2]
%if ARCH_X86_32
    movzx mxd, ssb
    shr ssd, 16
    cmp hd, 6
    cmovs ssd, mxd
    movq m0, [base_reg+ssq*8+subpel_filters-put_ssse3]
    mov ssq, ssmp
    lea r6, [ssq*3]
    sub srcq, r6
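    ; note (added): r1 has served as the PIC base (base_reg) so far on
    ; x86-32; it is about to be reloaded with the destination stride,
    ; so the base moves into r6 below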
 %define base_reg r6
    mov r6, r1 ; use as new base
 %assign regs_used 2
    ALLOC_STACK -mmsize*14
 %assign regs_used 7
    mov dsq, [rstk+stack_offset+gprsize*2]
    %define subpelv0 [rsp+mmsize*0]
    %define subpelv1 [rsp+mmsize*1]
    %define subpelv2 [rsp+mmsize*2]
    %define subpelv3 [rsp+mmsize*3]
    punpcklbw m0, m0
    psraw m0, 8 ; sign-extend
    pshufd m6, m0, q0000
    mova subpelv0, m6
    pshufd m6, m0, q1111
    mova subpelv1, m6
    pshufd m6, m0, q2222
    mova subpelv2, m6
    pshufd m6, m0, q3333
    mova subpelv3, m6
%else
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovs myd, mxd
    movq m0, [base_reg+myq*8+subpel_filters-put_ssse3]
    ALLOC_STACK mmsize*14, 14
    lea ss3q, [ssq*3]
    sub srcq, ss3q
    %define subpelv0 m10
    %define subpelv1 m11
    %define subpelv2 m12
    %define subpelv3 m13
    punpcklbw m0, m0
    psraw m0, 8 ; sign-extend
    mova m8, [base+pw_8192]
    mova m9, [base+pd_512]
    pshufd m10, m0, q0000
    pshufd m11, m0, q1111
    pshufd m12, m0, q2222
    pshufd m13, m0, q3333
%endif
    pshufd m7, m1, q0000
    cmp wd, 4
    je .hv_w4
.hv_w2:
    mova m6, [base+subpel_h_shuf4]
    movq m2, [srcq+ssq*0] ; 0
    movhps m2, [srcq+ssq*1] ; 0 _ 1
%if ARCH_X86_32
    %define w8192reg [base+pw_8192]
    %define d512reg [base+pd_512]
    lea srcq, [srcq+ssq*2]
    movq m0, [srcq+ssq*0] ; 2
    movhps m0, [srcq+ssq*1] ; 2 _ 3
    lea srcq, [srcq+ssq*2]
%else
    %define w8192reg m8
    %define d512reg m9
    movq m0, [srcq+ssq*2] ; 2
    add srcq, ss3q
    movhps m0, [srcq+ssq*0] ; 2 _ 3
%endif
    pshufb m2, m6 ; 0 ~ 1 ~
    pshufb m0, m6 ; 2 ~ 3 ~
    pmaddubsw m2, m7 ; subpel_filters
    pmaddubsw m0, m7 ; subpel_filters
    phaddw m2, m0 ; 0 1 2 3
    pmulhrsw m2, w8192reg
%if ARCH_X86_32
    movq m3, [srcq+ssq*0] ; 4
    movhps m3, [srcq+ssq*1] ; 4 _ 5
    lea srcq, [srcq+ssq*2]
%else
    movq m3, [srcq+ssq*1] ; 4
    movhps m3, [srcq+ssq*2] ; 4 _ 5
    add srcq, ss3q
%endif
    movq m0, [srcq+ssq*0] ; 6
    pshufb m3, m6 ; 4 ~ 5 ~
    pshufb m0, m6 ; 6 ~
    pmaddubsw m3, m7 ; subpel_filters
    pmaddubsw m0, m7 ; subpel_filters
    phaddw m3, m0 ; 4 5 6 _
    pmulhrsw m3, w8192reg
    palignr m4, m3, m2, 4 ; V 1 2 3 4
    punpcklwd m1, m2, m4 ; V 01 12    0 1 1 2
    punpckhwd m2, m4 ; V 23 34    2 3 3 4
    pshufd m0, m3, q2121 ; V 5 6 5 6
    punpcklwd m3, m0 ; V 45 56    4 5 5 6
.hv_w2_loop:
    movq m4, [srcq+ssq*1] ; V 7
    lea srcq, [srcq+ssq*2] ; V
    movhps m4, [srcq+ssq*0] ; V 7 8
    pshufb m4, m6
    pmaddubsw m4, m7
    pmaddwd m5, m1, subpelv0 ; V a0 b0
    mova m1, m2 ; V
    pmaddwd m2, subpelv1 ; V a1 b1
    paddd m5, m2 ; V
    mova m2, m3 ; V
    pmaddwd m3, subpelv2 ; a2 b2
    phaddw m4, m4
    pmulhrsw m4, w8192reg
    paddd m5, m3 ; V
    palignr m3, m4, m0, 12
    mova m0, m4
    punpcklwd m3, m0 ; V 67 78
    pmaddwd m4, m3, subpelv3 ; V a3 b3
    paddd m5, d512reg
    paddd m5, m4
    psrad m5, 10
    packssdw m5, m5
    packuswb m5, m5
    movd r4d, m5
    mov [dstq+dsq*0], r4w
    shr r4d, 16
    mov [dstq+dsq*1], r4w
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w2_loop
    RET
%undef w8192reg
%undef d512reg
.hv_w4:
%define hv4_line_0_0 4
%define hv4_line_0_1 5
%define hv4_line_0_2 6
%define hv4_line_0_3 7
%define hv4_line_0_4 8
%define hv4_line_0_5 9
%define hv4_line_1_0 10
%define hv4_line_1_1 11
%define hv4_line_1_2 12
%define hv4_line_1_3 13
%macro SAVELINE_W4 3
    mova [rsp+mmsize*hv4_line_%3_%2], %1
%endmacro
%macro RESTORELINE_W4 3
    mova %1, [rsp+mmsize*hv4_line_%3_%2]
%endmacro
%if ARCH_X86_32
    %define w8192reg [base+pw_8192]
    %define d512reg [base+pd_512]
%else
    %define w8192reg m8
    %define d512reg m9
%endif
    ; lower shuffle 0 1 2 3 4
    mova m6, [base+subpel_h_shuf4]
    movq m5, [srcq+ssq*0] ; 0 _ _ _
    movhps m5, [srcq+ssq*1] ; 0 _ 1 _
%if ARCH_X86_32
    lea srcq, [srcq+ssq*2]
    movq m4, [srcq+ssq*0] ; 2 _ _ _
    movhps m4, [srcq+ssq*1] ; 2 _ 3 _
    lea srcq, [srcq+ssq*2]
%else
    movq m4, [srcq+ssq*2] ; 2 _ _ _
    movhps m4, [srcq+ss3q ] ; 2 _ 3 _
    lea srcq, [srcq+ssq*4]
%endif
    pshufb m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~
    pshufb m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~
    pmaddubsw m2, m7 ;H subpel_filters
    pmaddubsw m0, m7 ;H subpel_filters
    phaddw m2, m0 ;H 0 1 2 3
    pmulhrsw m2, w8192reg ;H pw_8192
    SAVELINE_W4 m2, 2, 0
    ; upper shuffle 2 3 4 5 6
    mova m6, [base+subpel_h_shuf4+16]
    pshufb m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~
    pshufb m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~
    pmaddubsw m2, m7 ;H subpel_filters
    pmaddubsw m0, m7 ;H subpel_filters
    phaddw m2, m0 ;H 0 1 2 3
    pmulhrsw m2, w8192reg ;H pw_8192
    ;
    ; lower shuffle
    mova m6, [base+subpel_h_shuf4]
    movq m5, [srcq+ssq*0] ; 4 _ _ _
    movhps m5, [srcq+ssq*1] ; 4 _ 5 _
%if ARCH_X86_32
    lea srcq, [srcq+ssq*2]
    movq m4, [srcq+ssq*0] ; 6 _ _ _
    add srcq, ssq
%else
    movq m4, [srcq+ssq*2] ; 6 _ _ _
    add srcq, ss3q
%endif
    pshufb m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~
    pshufb m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~
    pmaddubsw m3, m7 ;H subpel_filters
    pmaddubsw m0, m7 ;H subpel_filters
    phaddw m3, m0 ;H 4 5 6 7
    pmulhrsw m3, w8192reg ;H pw_8192
    SAVELINE_W4 m3, 3, 0
    ; upper shuffle
    mova m6, [base+subpel_h_shuf4+16]
    pshufb m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~
    pshufb m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~
    pmaddubsw m3, m7 ;H subpel_filters
    pmaddubsw m0, m7 ;H subpel_filters
    phaddw m3, m0 ;H 4 5 6 7
    pmulhrsw m3, w8192reg ;H pw_8192
    ;process high
    palignr m4, m3, m2, 4 ;V 1 2 3 4
    punpcklwd m1, m2, m4 ; V 01 12
    punpckhwd m2, m4 ; V 23 34
    pshufd m0, m3, q2121 ;V 5 6 5 6
    punpcklwd m3, m0 ; V 45 56
    SAVELINE_W4 m0, 0, 1
    SAVELINE_W4 m1, 1, 1
    SAVELINE_W4 m2, 2, 1
    SAVELINE_W4 m3, 3, 1
    ;process low
    RESTORELINE_W4 m2, 2, 0
    RESTORELINE_W4 m3, 3, 0
    palignr m4, m3, m2, 4 ;V 1 2 3 4
    punpcklwd m1, m2, m4 ; V 01 12
    punpckhwd m2, m4 ; V 23 34
    pshufd m0, m3, q2121 ;V 5 6 5 6
    punpcklwd m3, m0 ; V 45 56
.hv_w4_loop:
    ;process low
    pmaddwd m5, m1, subpelv0 ; V a0 b0
    mova m1, m2
    pmaddwd m2, subpelv1 ; V a1 b1
    paddd m5, m2
    mova m2, m3
    pmaddwd m3, subpelv2 ; V a2 b2
    paddd m5, m3
    mova m6, [base+subpel_h_shuf4]
    movq m4, [srcq+ssq*0] ; 7
    movhps m4, [srcq+ssq*1] ; 7 _ 8 _
    pshufb m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~
    pmaddubsw m4, m7 ;H subpel_filters
    phaddw m4, m4 ;H 7 8 7 8
    pmulhrsw m4, w8192reg ;H pw_8192
    palignr m3, m4, m0, 12 ; 6 7 8 7
    mova m0, m4
    punpcklwd m3, m4 ; 67 78
    pmaddwd m4, m3, subpelv3 ; a3 b3
    paddd m5, d512reg ; pd_512
    paddd m5, m4
    psrad m5, 10
    SAVELINE_W4 m0, 0, 0
    SAVELINE_W4 m1, 1, 0
.hv_w4_loop:
    ;process low
    pmaddwd    m5, m1, subpelv0 ; V a0 b0
    mova       m1, m2
    pmaddwd    m2, subpelv1 ; V a1 b1
    paddd      m5, m2
    mova       m2, m3
    pmaddwd    m3, subpelv2 ; V a2 b2
    paddd      m5, m3
    mova       m6, [base+subpel_h_shuf4]
    movq       m4, [srcq+ssq*0] ; 7
    movhps     m4, [srcq+ssq*1] ; 7 _ 8 _
    pshufb     m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~
    pmaddubsw  m4, m7 ;H subpel_filters
    phaddw     m4, m4 ;H 7 8 7 8
    pmulhrsw   m4, w8192reg ;H pw_8192
    palignr    m3, m4, m0, 12 ; 6 7 8 7
    mova       m0, m4
    punpcklwd  m3, m4 ; 67 78
    pmaddwd    m4, m3, subpelv3 ; a3 b3
    paddd      m5, d512reg ; pd_512
    paddd      m5, m4
    psrad      m5, 10
    SAVELINE_W4 m0, 0, 0
    SAVELINE_W4 m1, 1, 0
    SAVELINE_W4 m2, 2, 0
    SAVELINE_W4 m3, 3, 0
    SAVELINE_W4 m5, 5, 0
    ;process high
    RESTORELINE_W4 m0, 0, 1
    RESTORELINE_W4 m1, 1, 1
    RESTORELINE_W4 m2, 2, 1
    RESTORELINE_W4 m3, 3, 1
    pmaddwd    m5, m1, subpelv0 ; V a0 b0
    mova       m1, m2
    pmaddwd    m2, subpelv1 ; V a1 b1
    paddd      m5, m2
    mova       m2, m3
    pmaddwd    m3, subpelv2 ; V a2 b2
    paddd      m5, m3
    mova       m6, [base+subpel_h_shuf4+16]
    movq       m4, [srcq+ssq*0] ; 7
    movhps     m4, [srcq+ssq*1] ; 7 _ 8 _
    lea        srcq, [srcq+ssq*2]
    pshufb     m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~
    pmaddubsw  m4, m7 ;H subpel_filters
    phaddw     m4, m4 ;H 7 8 7 8
    pmulhrsw   m4, w8192reg ;H pw_8192
    palignr    m3, m4, m0, 12 ; 6 7 8 7
    mova       m0, m4
    punpcklwd  m3, m4 ; 67 78
    pmaddwd    m4, m3, subpelv3 ; a3 b3
    paddd      m5, d512reg ; pd_512
    paddd      m5, m4
    psrad      m4, m5, 10
    RESTORELINE_W4 m5, 5, 0
    packssdw   m5, m4 ; d -> w
    packuswb   m5, m5 ; w -> b
    pshuflw    m5, m5, q3120
    movd       [dstq+dsq*0], m5
    psrlq      m5, 32
    movd       [dstq+dsq*1], m5
    lea        dstq, [dstq+dsq*2]
    sub        hd, 2
    SAVELINE_W4 m0, 0, 1
    SAVELINE_W4 m1, 1, 1
    SAVELINE_W4 m2, 2, 1
    SAVELINE_W4 m3, 3, 1
    RESTORELINE_W4 m0, 0, 0
    RESTORELINE_W4 m1, 1, 0
    RESTORELINE_W4 m2, 2, 0
    RESTORELINE_W4 m3, 3, 0
    jg .hv_w4_loop
    RET
%undef subpelv0
%undef subpelv1
%undef subpelv2
%undef subpelv3
.hv_w8:
    RESET_STACK_STATE
%define hv8_line_1 0
%define hv8_line_2 1
%define hv8_line_3 2
%define hv8_line_4 3
%define hv8_line_6 4
%macro SAVELINE_W8 2
    mova       [rsp+hv8_line_%1*mmsize], %2
%endmacro
%macro RESTORELINE_W8 2
    mova       %2, [rsp+hv8_line_%1*mmsize]
%endmacro
    shr        mxd, 16
    sub        srcq, 3
%if ARCH_X86_32
    %define base_reg r1
    %define subpelh0 [rsp+mmsize*5]
    %define subpelh1 [rsp+mmsize*6]
    %define subpelv0 [rsp+mmsize*7]
    %define subpelv1 [rsp+mmsize*8]
    %define subpelv2 [rsp+mmsize*9]
    %define subpelv3 [rsp+mmsize*10]
    %define accuv0   [rsp+mmsize*11]
    %define accuv1   [rsp+mmsize*12]
    movq       m1, [base_reg+mxq*8+subpel_filters-put_ssse3]
    movzx      mxd, ssb
    shr        ssd, 16
    cmp        hd, 6
    cmovs      ssd, mxd
    movq       m5, [base_reg+ssq*8+subpel_filters-put_ssse3]
    mov        ssq, ssmp
    ALLOC_STACK -mmsize*13
%if STACK_ALIGNMENT < 16
    %define dstm [rsp+mmsize*13+gprsize*1]
    %define dsm  [rsp+mmsize*13+gprsize*2]
    mov        r6, [rstk+stack_offset+gprsize*2]
    mov        dsm, r6
%endif
    pshufd     m0, m1, q0000
    pshufd     m1, m1, q1111
    punpcklbw  m5, m5
    psraw      m5, 8 ; sign-extend
    pshufd     m2, m5, q0000
    pshufd     m3, m5, q1111
    pshufd     m4, m5, q2222
    pshufd     m5, m5, q3333
    mova       subpelh0, m0
    mova       subpelh1, m1
    mova       subpelv0, m2
    mova       subpelv1, m3
    mova       subpelv2, m4
    mova       subpelv3, m5
    lea        r6, [ssq*3]
    mov        dstm, dstq
    sub        srcq, r6
%else
    ALLOC_STACK 16*5, 16
    %define subpelh0 m10
    %define subpelh1 m11
    %define subpelv0 m12
    %define subpelv1 m13
    %define subpelv2 m14
    %define subpelv3 m15
    %define accuv0   m8
    %define accuv1   m9
    movq       m0, [base_reg+mxq*8+subpel_filters-put_ssse3]
    movzx      mxd, myb
    shr        myd, 16
    cmp        hd, 6
    cmovs      myd, mxd
    movq       m1, [base_reg+myq*8+subpel_filters-put_ssse3]
    pshufd     subpelh0, m0, q0000
    pshufd     subpelh1, m0, q1111
    punpcklbw  m1, m1
    psraw      m1, 8 ; sign-extend
    pshufd     subpelv0, m1, q0000
    pshufd     subpelv1, m1, q1111
    pshufd     subpelv2, m1, q2222
    pshufd     subpelv3, m1, q3333
    lea        ss3q, [ssq*3]
    mov        r7, dstq
    sub        srcq, ss3q
%endif
    shl        wd, 14
    lea        r6d, [hq+wq-(1<<16)]
    mov        r4, srcq
.hv_w8_loop0:
    movu       m4, [srcq+ssq*0] ; 0 = _ _
    movu       m5, [srcq+ssq*1] ; 1 = _ _
%if ARCH_X86_32
    lea        srcq, [srcq+ssq*2]
%endif
%macro HV_H_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3]
    %if ARCH_X86_32
    pshufb     %3, %1, [base+subpel_h_shufB]
    pshufb     %4, %1, [base+subpel_h_shufC]
    pshufb     %1, [base+subpel_h_shufA]
    %else
    pshufb     %3, %1, %6 ; subpel_h_shufB
    pshufb     %4, %1, %7 ; subpel_h_shufC
    pshufb     %1, %5     ; subpel_h_shufA
    %endif
    pmaddubsw  %2, %3, subpelh0 ; subpel +0 C0
    pmaddubsw  %4, subpelh1     ; subpel +4 B4
    pmaddubsw  %3, subpelh1     ; C4
    pmaddubsw  %1, subpelh0     ; A0
    paddw      %2, %4           ; C0+B4
    paddw      %1, %3           ; A0+C4
    phaddw     %1, %2
%endmacro
%if ARCH_X86_64
    mova       m7, [base+subpel_h_shufA]
    mova       m8, [base+subpel_h_shufB]
    mova       m9, [base+subpel_h_shufC]
%endif
    HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 0 ~ ~ ~
    HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 1 ~ ~ ~
%if ARCH_X86_32
    movu       m6, [srcq+ssq*0] ; 2 = _ _
    movu       m0, [srcq+ssq*1] ; 3 = _ _
    lea        srcq, [srcq+ssq*2]
%else
    movu       m6, [srcq+ssq*2] ; 2 = _ _
    add        srcq, ss3q
    movu       m0, [srcq+ssq*0] ; 3 = _ _
%endif
    HV_H_W8 m6, m1, m2, m3, m7, m8, m9 ; 2 ~ ~ ~
    HV_H_W8 m0, m1, m2, m3, m7, m8, m9 ; 3 ~ ~ ~
    mova       m7, [base+pw_8192]
    pmulhrsw   m4, m7 ; H pw_8192
    pmulhrsw   m5, m7 ; H pw_8192
    pmulhrsw   m6, m7 ; H pw_8192
    pmulhrsw   m0, m7 ; H pw_8192
    punpcklwd  m1, m4, m5 ; 0 1 ~
    punpcklwd  m2, m5, m6 ; 1 2 ~
    punpcklwd  m3, m6, m0 ; 2 3 ~
    SAVELINE_W8 1, m1
    SAVELINE_W8 2, m2
    SAVELINE_W8 3, m3
    mova       m7, [base+subpel_h_shufA]
%if ARCH_X86_32
    movu       m4, [srcq+ssq*0] ; 4 = _ _
    movu       m5, [srcq+ssq*1] ; 5 = _ _
    lea        srcq, [srcq+ssq*2]
%else
    movu       m4, [srcq+ssq*1] ; 4 = _ _
    movu       m5, [srcq+ssq*2] ; 5 = _ _
    add        srcq, ss3q
%endif
    movu       m6, [srcq+ssq*0] ; 6 = _ _
    HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 4 ~ ~ ~
    HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 5 ~ ~ ~
    HV_H_W8 m6, m1, m2, m3, m7, m8, m9 ; 6 ~ ~ ~
    mova       m7, [base+pw_8192]
    pmulhrsw   m1, m4, m7 ; H pw_8192 4 ~
    pmulhrsw   m2, m5, m7 ; H pw_8192 5 ~
    pmulhrsw   m3, m6, m7 ; H pw_8192 6 ~
    punpcklwd  m4, m0, m1 ; 3 4 ~
    punpcklwd  m5, m1, m2 ; 4 5 ~
    punpcklwd  m6, m2, m3 ; 5 6 ~
    SAVELINE_W8 6, m3
    RESTORELINE_W8 1, m1
    RESTORELINE_W8 2, m2
    RESTORELINE_W8 3, m3
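; The two vertical accumulators for the a/b output rows are kept across the
; horizontal filtering of rows 7 and 8 in accuv0/accuv1 (m8/m9 on x86-64,
; stack slots on x86-32), freeing the low registers as scratch for HV_H_W8.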
.hv_w8_loop:
    ; m8 accu for V a
    ; m9 accu for V b
    SAVELINE_W8 1, m3
    SAVELINE_W8 2, m4
    SAVELINE_W8 3, m5
    SAVELINE_W8 4, m6
%if ARCH_X86_32
    pmaddwd    m0, m1, subpelv0 ; a0
    pmaddwd    m7, m2, subpelv0 ; b0
    pmaddwd    m3, subpelv1     ; a1
    pmaddwd    m4, subpelv1     ; b1
    paddd      m0, m3
    paddd      m7, m4
    pmaddwd    m5, subpelv2     ; a2
    pmaddwd    m6, subpelv2     ; b2
    paddd      m0, m5
    paddd      m7, m6
    mova       m5, [base+pd_512]
    paddd      m0, m5 ; pd_512
    paddd      m7, m5 ; pd_512
    mova       accuv0, m0
    mova       accuv1, m7
%else
    pmaddwd    m8, m1, subpelv0 ; a0
    pmaddwd    m9, m2, subpelv0 ; b0
    pmaddwd    m3, subpelv1     ; a1
    pmaddwd    m4, subpelv1     ; b1
    paddd      m8, m3
    paddd      m9, m4
    pmaddwd    m5, subpelv2     ; a2
    pmaddwd    m6, subpelv2     ; b2
    paddd      m8, m5
    paddd      m9, m6
    mova       m7, [base+pd_512]
    paddd      m8, m7 ; pd_512
    paddd      m9, m7 ; pd_512
    mova       m7, [base+subpel_h_shufB]
    mova       m6, [base+subpel_h_shufC]
    mova       m5, [base+subpel_h_shufA]
%endif
    movu       m0, [srcq+ssq*1] ; 7
    movu       m4, [srcq+ssq*2] ; 8
    lea        srcq, [srcq+ssq*2]
    HV_H_W8 m0, m1, m2, m3, m5, m7, m6
    HV_H_W8 m4, m1, m2, m3, m5, m7, m6
    mova       m5, [base+pw_8192]
    pmulhrsw   m0, m5 ; H pw_8192
    pmulhrsw   m4, m5 ; H pw_8192
    RESTORELINE_W8 6, m6
    punpcklwd  m5, m6, m0 ; 6 7 ~
    punpcklwd  m6, m0, m4 ; 7 8 ~
    pmaddwd    m1, m5, subpelv3 ; a3
    paddd      m2, m1, accuv0
    pmaddwd    m1, m6, subpelv3 ; b3
    paddd      m1, m1, accuv1 ; H + V
    psrad      m2, 10
    psrad      m1, 10
    packssdw   m2, m1 ; d -> w
    packuswb   m2, m1 ; w -> b
    movd       [dstq+dsq*0], m2
    psrlq      m2, 32
%if ARCH_X86_32
    add        dstq, dsm
    movd       [dstq+dsq*0], m2
    add        dstq, dsm
%else
    movd       [dstq+dsq*1], m2
    lea        dstq, [dstq+dsq*2]
%endif
    sub        hd, 2
    jle .hv_w8_outer
    SAVELINE_W8 6, m4
    RESTORELINE_W8 1, m1
    RESTORELINE_W8 2, m2
    RESTORELINE_W8 3, m3
    RESTORELINE_W8 4, m4
    jmp .hv_w8_loop
.hv_w8_outer:
%if ARCH_X86_32
    mov        dstq, dstm
    add        r4, 4
    movzx      hd, r6w
    add        dstq, 4
    mov        srcq, r4
    mov        dstm, dstq
%else
    add        r4, 4
    add        r7, 4
    movzx      hd, r6b
    mov        srcq, r4
    mov        dstq, r7
%endif
    sub        r6d, 1<<16
    jg .hv_w8_loop0
    RET

%macro PSHUFB_SUBPEL_H_4 5 ; dst/src1, src2/mask, tmp1, tmp2, reset_mask
    %if cpuflag(ssse3)
    pshufb     %1, %2
    %else
    %if %5 == 1
    pcmpeqd    %2, %2
    psrlq      %2, 32
    %endif
    psrldq     %3, %1, 1
    pshufd     %3, %3, q2301
    pand       %1, %2
    pandn      %4, %2, %3
    por        %1, %4
    %endif
%endmacro

%macro PSHUFB_SUBPEL_H_4a 6 ; dst, src1, src2/mask, tmp1, tmp2, reset_mask
    %ifnidn %1, %2
    mova       %1, %2
    %endif
    PSHUFB_SUBPEL_H_4 %1, %3, %4, %5, %6
%endmacro

%macro PSHUFB_SUBPEL_H_4b 6 ; dst, src1, src2/mask, tmp1, tmp2, reset_mask
    %if notcpuflag(ssse3)
    psrlq      %1, %2, 16
    %elifnidn %1, %2
    mova       %1, %2
    %endif
    PSHUFB_SUBPEL_H_4 %1, %3, %4, %5, %6
%endmacro

%macro PALIGNR 4-5 ; dst, src1, src2, shift[, tmp]
    %if cpuflag(ssse3)
    palignr    %1, %2, %3, %4
    %else
    %if %0 == 4
    %assign %%i regnumof%+%1 + 1
    %define %%tmp m %+ %%i
    %else
    %define %%tmp %5
    %endif
    psrldq     %1, %3, %4
    pslldq     %%tmp, %2, 16-%4
    por        %1, %%tmp
    %endif
%endmacro

%macro PHADDW 4 ; dst, src, pw_1/tmp, load_pw_1
    %if cpuflag(ssse3)
    phaddw     %1, %2
    %elifnidn %1, %2
    %if %4 == 1
    mova       %3, [base+pw_1]
    %endif
    pmaddwd    %1, %3
    pmaddwd    %2, %3
    packssdw   %1, %2
    %else
    %if %4 == 1
    pmaddwd    %1, [base+pw_1]
    %else
    pmaddwd    %1, %3
    %endif
    packssdw   %1, %1
    %endif
%endmacro

%macro PMULHRSW_POW2 4 ; dst, src1, src2, shift
    %if cpuflag(ssse3)
    pmulhrsw   %1, %2, %3
    %else
    paddw      %1, %2, %3
    psraw      %1, %4
    %endif
%endmacro

%macro PMULHRSW_8192 3 ; dst, src1, src2
    PMULHRSW_POW2 %1, %2, %3, 2
%endmacro

%macro PREP_8TAP_H_LOAD4 5 ; dst, src_memloc, tmp[1-3]
    movd       %1, [%2+0]
    movd       %3, [%2+1]
    movd       %4, [%2+2]
    movd       %5, [%2+3]
    punpckldq  %1, %3
    punpckldq  %4, %5
    punpcklqdq %1, %4
%endmacro
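; Without SSSE3 there is no pshufb, so PREP_8TAP_H_LOAD below falls back to
; building the shifted copies of each source row from overlapping unaligned
; loads (PREP_8TAP_H_LOAD4) instead of shuffling a single movu.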
%macro PREP_8TAP_H_LOAD 2 ; dst0, src_memloc
    %if cpuflag(ssse3)
    movu       m%1, [%2]
    pshufb     m2, m%1, m11 ; subpel_h_shufB
    pshufb     m3, m%1, m9  ; subpel_h_shufC
    pshufb     m%1, m10     ; subpel_h_shufA
    %else
    %if ARCH_X86_64
    SWAP m12, m5
    SWAP m13, m6
    SWAP m14, m7
    %define %%mx0 m%+%%i
    %define %%mx1 m%+%%j
    %assign %%i 0
    %rep 12
    movd       %%mx0, [%2+%%i]
    %assign %%i %%i+1
    %endrep
    %assign %%i 0
    %rep 6
    %assign %%j %%i+1
    punpckldq  %%mx0, %%mx1
    %assign %%i %%i+2
    %endrep
    %assign %%i 0
    %rep 3
    %assign %%j %%i+2
    punpcklqdq %%mx0, %%mx1
    %assign %%i %%i+4
    %endrep
    SWAP m%1, m0
    SWAP m2, m4
    SWAP m3, m8
    SWAP m5, m12
    SWAP m6, m13
    SWAP m7, m14
    %else
    PREP_8TAP_H_LOAD4 m0, %2+0, m1, m4, m7
    PREP_8TAP_H_LOAD4 m2, %2+4, m1, m4, m7
    PREP_8TAP_H_LOAD4 m3, %2+8, m1, m4, m7
    SWAP m%1, m0
    %endif
    %endif
%endmacro

%macro PREP_8TAP_H 2 ; dst, src_memloc
    PREP_8TAP_H_LOAD %1, %2
    %if ARCH_X86_64 && notcpuflag(ssse3)
    SWAP m8, m1
    SWAP m9, m7
    %endif
    %xdefine mX m%+%1
    %assign %%i regnumof%+mX
    %define mX m%+%%i
    mova       m4, m2
    PMADDUBSW m4, m5, m1, m7, 1 ; subpel +0 B0
    PMADDUBSW m2, m6, m1, m7, 0 ; subpel +4 B4
    PMADDUBSW m3, m6, m1, m7, 0 ; subpel +4 C4
    PMADDUBSW mX, m5, m1, m7, 0 ; subpel +0 A0
    %undef mX
    %if ARCH_X86_64 && notcpuflag(ssse3)
    SWAP m1, m8
    SWAP m7, m9
    %endif
    paddw      m3, m4
    paddw      m%1, m2
    PHADDW m%1, m3, m15, ARCH_X86_32
    %if ARCH_X86_64 || cpuflag(ssse3)
    PMULHRSW_8192 m%1, m%1, m7
    %else
    PMULHRSW_8192 m%1, m%1, [base+pw_2]
    %endif
%endmacro

%macro PREP_8TAP_HV 4 ; dst, src_memloc, tmp[1-2]
    %if cpuflag(ssse3)
    movu       %1, [%2]
    pshufb     m2, %1, shufB
    pshufb     m3, %1, shufC
    pshufb     %1, shufA
    %else
    PREP_8TAP_H_LOAD4 %1, %2+0, m1, %3, %4
    PREP_8TAP_H_LOAD4 m2, %2+4, m1, %3, %4
    PREP_8TAP_H_LOAD4 m3, %2+8, m1, %3, %4
    %endif
    mova       m1, m2
    PMADDUBSW m1, subpelh0, %3, %4, 1 ; subpel +0 C0
    PMADDUBSW m3, subpelh1, %3, %4, 0 ; subpel +4 B4
    PMADDUBSW m2, subpelh1, %3, %4, 0 ; C4
    PMADDUBSW %1, subpelh0, %3, %4, 0 ; A0
    paddw      m1, m3 ; C0+B4
    paddw      %1, m2 ; A0+C4
    PHADDW %1, m1, %3, 1
%endmacro

%macro PREP_8TAP 0
%if ARCH_X86_32
 DECLARE_REG_TMP 1, 2
%elif WIN64
 DECLARE_REG_TMP 6, 4
%else
 DECLARE_REG_TMP 6, 7
%endif

FN prep_8tap, sharp,          SHARP,   SHARP
FN prep_8tap, sharp_smooth,   SHARP,   SMOOTH
FN prep_8tap, smooth_sharp,   SMOOTH,  SHARP
FN prep_8tap, smooth,         SMOOTH,  SMOOTH
FN prep_8tap, sharp_regular,  SHARP,   REGULAR
FN prep_8tap, regular_sharp,  REGULAR, SHARP
FN prep_8tap, smooth_regular, SMOOTH,  REGULAR
FN prep_8tap, regular_smooth, REGULAR, SMOOTH
FN prep_8tap, regular,        REGULAR, REGULAR

%if ARCH_X86_32
 %define base_reg r2
 %define base base_reg-prep%+SUFFIX
%else
 %define base_reg r7
 %define base 0
%endif
cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
    imul       mxd, mxm, 0x010101
    add        mxd, t0d ; 8tap_h, mx, 4tap_h
    imul       myd, mym, 0x010101
    add        myd, t1d ; 8tap_v, my, 4tap_v
    mov        wd, wm
    movifnidn  srcd, srcm
    movifnidn  hd, hm
    test       mxd, 0xf00
    jnz .h
    test       myd, 0xf00
    jnz .v
    LEA base_reg, prep_ssse3
    tzcnt      wd, wd
    movzx      wd, word [base_reg-prep_ssse3+prep_ssse3_table+wq*2]
    pxor       m4, m4
    add        wq, base_reg
    movifnidn  strided, stridem
    lea        r6, [strideq*3]
%if WIN64
    pop        r8
    pop        r7
%endif
    jmp        wq
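; Horizontal-only prep path: each row is filtered with the horizontal
; kernel and the intermediate is rounded via PMULHRSW_8192, i.e. pmulhrsw
; against pw_8192 on SSSE3, or the paddw pw_2 + psraw 2 pair on plain SSE2.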
.h:
    LEA base_reg, prep%+SUFFIX
    test       myd, 0xf00
    jnz .hv
%if cpuflag(ssse3)
    WIN64_SPILL_XMM 12
%else
    WIN64_SPILL_XMM 16
%endif
%if ARCH_X86_32
    %define strideq r6
    mov        strideq, stridem
%endif
    cmp        wd, 4
    je .h_w4
    tzcnt      wd, wd
%if cpuflag(ssse3)
    %if ARCH_X86_64
    mova       m10, [base+subpel_h_shufA]
    mova       m11, [base+subpel_h_shufB]
    mova       m9, [base+subpel_h_shufC]
    %else
    %define m10 [base+subpel_h_shufA]
    %define m11 [base+subpel_h_shufB]
    %define m9  [base+subpel_h_shufC]
    %endif
%endif
    shr        mxd, 16
    sub        srcq, 3
    movzx      wd, word [base_reg+wq*2+table_offset(prep, _8tap_h)]
    movq       m6, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX]
%if cpuflag(ssse3)
    mova       m7, [base+pw_8192]
    pshufd     m5, m6, q0000
    pshufd     m6, m6, q1111
%else
    punpcklbw  m6, m6
    psraw      m6, 8
    %if ARCH_X86_64
    mova       m7, [pw_2]
    mova       m15, [pw_1]
    %else
    %define m15 m4
    %endif
    pshufd     m5, m6, q1010
    punpckhqdq m6, m6
%endif
    add        wq, base_reg
    jmp        wq
.h_w4:
%if ARCH_X86_32
    and        mxd, 0x7f
%else
    movzx      mxd, mxb
%endif
    dec        srcq
    movd       m4, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+2]
%if cpuflag(ssse3)
    mova       m6, [base+pw_8192]
    mova       m5, [base+subpel_h_shufA]
    pshufd     m4, m4, q0000
%else
    mova       m6, [base+pw_2]
    %if ARCH_X86_64
    mova       m14, [pw_1]
    %else
    %define m14 m7
    %endif
    punpcklbw  m4, m4
    psraw      m4, 8
    punpcklqdq m4, m4
%endif
%if ARCH_X86_64
    lea        stride3q, [strideq*3]
%endif
.h_w4_loop:
%if cpuflag(ssse3)
    movq       m0, [srcq+strideq*0] ; 0
    movq       m1, [srcq+strideq*1] ; 1
    %if ARCH_X86_32
    lea        srcq, [srcq+strideq*2]
    movq       m2, [srcq+strideq*0] ; 2
    movq       m3, [srcq+strideq*1] ; 3
    lea        srcq, [srcq+strideq*2]
    %else
    movq       m2, [srcq+strideq*2] ; 2
    movq       m3, [srcq+stride3q ] ; 3
    lea        srcq, [srcq+strideq*4]
    %endif
    pshufb     m0, m5
    pshufb     m1, m5
    pshufb     m2, m5
    pshufb     m3, m5
%elif ARCH_X86_64
    movd       m0, [srcq+strideq*0+0]
    movd       m12, [srcq+strideq*0+1]
    movd       m1, [srcq+strideq*1+0]
    movd       m5, [srcq+strideq*1+1]
    movd       m2, [srcq+strideq*2+0]
    movd       m13, [srcq+strideq*2+1]
    movd       m3, [srcq+stride3q +0]
    movd       m7, [srcq+stride3q +1]
    punpckldq  m0, m12
    punpckldq  m1, m5
    punpckldq  m2, m13
    punpckldq  m3, m7
    movd       m12, [srcq+strideq*0+2]
    movd       m8, [srcq+strideq*0+3]
    movd       m5, [srcq+strideq*1+2]
    movd       m9, [srcq+strideq*1+3]
    movd       m13, [srcq+strideq*2+2]
    movd       m10, [srcq+strideq*2+3]
    movd       m7, [srcq+stride3q +2]
    movd       m11, [srcq+stride3q +3]
    lea        srcq, [srcq+strideq*4]
    punpckldq  m12, m8
    punpckldq  m5, m9
    punpckldq  m13, m10
    punpckldq  m7, m11
    punpcklqdq m0, m12 ; 0
    punpcklqdq m1, m5  ; 1
    punpcklqdq m2, m13 ; 2
    punpcklqdq m3, m7  ; 3
%else
    movd       m0, [srcq+strideq*0+0]
    movd       m1, [srcq+strideq*0+1]
    movd       m2, [srcq+strideq*0+2]
    movd       m3, [srcq+strideq*0+3]
    punpckldq  m0, m1
    punpckldq  m2, m3
    punpcklqdq m0, m2 ; 0
    movd       m1, [srcq+strideq*1+0]
    movd       m2, [srcq+strideq*1+1]
    movd       m3, [srcq+strideq*1+2]
    movd       m7, [srcq+strideq*1+3]
    lea        srcq, [srcq+strideq*2]
    punpckldq  m1, m2
    punpckldq  m3, m7
    punpcklqdq m1, m3 ; 1
    movd       m2, [srcq+strideq*0+0]
    movd       m3, [srcq+strideq*0+1]
    movd       m7, [srcq+strideq*0+2]
    movd       m5, [srcq+strideq*0+3]
    punpckldq  m2, m3
    punpckldq  m7, m5
    punpcklqdq m2, m7 ; 2
    movd       m3, [srcq+strideq*1+0]
    movd       m7, [srcq+strideq*1+1]
    punpckldq  m3, m7
    movd       m7, [srcq+strideq*1+2]
    movd       m5, [srcq+strideq*1+3]
    lea        srcq, [srcq+strideq*2]
    punpckldq  m7, m5
    punpcklqdq m3, m7 ; 3
%endif
    PMADDUBSW m0, m4, m5, m7, 1 ; subpel_filters + 2
    PMADDUBSW m1, m4, m5, m7, 0
    PMADDUBSW m2, m4, m5, m7, 0
    PMADDUBSW m3, m4, m5, m7, 0
    PHADDW m0, m1, m14, ARCH_X86_32
    PHADDW m2, m3, m14, 0
    PMULHRSW_8192 m0, m0, m6
    PMULHRSW_8192 m2, m2, m6
    mova       [tmpq+16*0], m0
    mova       [tmpq+16*1], m2
    add        tmpq, 32
    sub        hd, 4
    jg .h_w4_loop
    RET
.h_w8:
%if cpuflag(ssse3)
    PREP_8TAP_H 0, srcq+strideq*0
    PREP_8TAP_H 1, srcq+strideq*1
    mova       [tmpq+16*0], m0
    mova       [tmpq+16*1], m1
    lea        srcq, [srcq+strideq*2]
    add        tmpq, 32
    sub        hd, 2
%else
    PREP_8TAP_H 0, srcq
    mova       [tmpq], m0
    add        srcq, strideq
    add        tmpq, 16
    dec        hd
%endif
    jg .h_w8
    RET
.h_w16:
    mov        r3, -16*1
    jmp .h_start
.h_w32:
    mov        r3, -16*2
    jmp .h_start
.h_w64:
    mov        r3, -16*4
    jmp .h_start
.h_w128:
    mov        r3, -16*8
.h_start:
    sub        srcq, r3
    mov        r5, r3
.h_loop:
%if cpuflag(ssse3)
    PREP_8TAP_H 0, srcq+r3+8*0
    PREP_8TAP_H 1, srcq+r3+8*1
    mova       [tmpq+16*0], m0
    mova       [tmpq+16*1], m1
    add        tmpq, 32
    add        r3, 16
%else
    PREP_8TAP_H 0, srcq+r3
    mova       [tmpq], m0
    add        tmpq, 16
    add        r3, 8
%endif
    jl .h_loop
    add        srcq, strideq
    mov        r3, r5
    dec        hd
    jg .h_loop
    RET
.v:
    LEA base_reg, prep%+SUFFIX
%if ARCH_X86_32
    mov        mxd, myd
    and        mxd, 0x7f
%else
    WIN64_SPILL_XMM 16
    movzx      mxd, myb
%endif
    shr        myd, 16
    cmp        hd, 6
    cmovs      myd, mxd
    movq       m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
%if cpuflag(ssse3)
    mova       m2, [base+pw_512]
    mova       m7, [base+pw_8192]
    punpcklwd  m0, m0
%else
    punpcklbw  m0, m0
    psraw      m0, 8
%endif
%if ARCH_X86_32
    %define subpel0 [rsp+mmsize*0]
    %define subpel1 [rsp+mmsize*1]
    %define subpel2 [rsp+mmsize*2]
    %define subpel3 [rsp+mmsize*3]
%assign regs_used 6 ; use r5 (mx) as tmp for stack alignment if needed
    %if cpuflag(ssse3)
    ALLOC_STACK -mmsize*4
    %else
    ALLOC_STACK -mmsize*5
    %endif
%assign regs_used 7
    mov        strideq, [rstk+stack_offset+gprsize*3]
    pshufd     m1, m0, q0000
    mova       subpel0, m1
    pshufd     m1, m0, q1111
    mova       subpel1, m1
    lea        r5, [strideq*3]
    pshufd     m1, m0, q2222
    mova       subpel2, m1
    pshufd     m1, m0, q3333
    mova       subpel3, m1
    sub        srcq, r5
%else
    %define subpel0 m8
    %define subpel1 m9
    %define subpel2 m10
    %define subpel3 m11
    pshufd     m8, m0, q0000
    pshufd     m9, m0, q1111
    lea        stride3q, [strideq*3]
    pshufd     m10, m0, q2222
    pshufd     m11, m0, q3333
    sub        srcq, stride3q
    cmp        wd, 8
    jns .v_w8
%endif
.v_w4:
%if notcpuflag(ssse3)
    pxor       m6, m6
    %if ARCH_X86_64
    mova       m7, [base+pw_2]
    %endif
%endif
%if ARCH_X86_32
    %if STACK_ALIGNMENT < mmsize
    %define srcm [esp+stack_size+gprsize*1]
    %define tmpm [esp+stack_size+gprsize*2]
    %endif
    mov        tmpm, tmpq
    mov        srcm, srcq
    lea        r5d, [wq - 4]   ; horizontal loop
    shl        r5d, (16 - 2)   ; (wq / 4) << 16
    mov        r5w, hw
.v_w4_loop0:
%endif
    movd       m1, [srcq+strideq*0]
    movd       m0, [srcq+strideq*1]
%if ARCH_X86_32
    lea        srcq, [srcq+strideq*2]
    movd       m2, [srcq+strideq*0]
    movd       m4, [srcq+strideq*1]
    lea        srcq, [srcq+strideq*2]
    movd       m3, [srcq+strideq*0]
    movd       m5, [srcq+strideq*1]
    lea        srcq, [srcq+strideq*2]
%else
    movd       m2, [srcq+strideq*2]
    add        srcq, stride3q
    movd       m4, [srcq+strideq*0]
    movd       m3, [srcq+strideq*1]
    movd       m5, [srcq+strideq*2]
    add        srcq, stride3q
%endif
    punpckldq  m1, m0 ; 0 1
    punpckldq  m0, m2 ; 1 2
    punpcklbw  m1, m0 ; 01 12
    movd       m0, [srcq+strideq*0]
    punpckldq  m2, m4 ; 2 3
    punpckldq  m4, m3 ; 3 4
    punpckldq  m3, m5 ; 4 5
    punpckldq  m5, m0 ; 5 6
    punpcklbw  m2, m4 ; 23 34
    punpcklbw  m3, m5 ; 45 56
.v_w4_loop:
%if ARCH_X86_32 && notcpuflag(ssse3)
    mova       m7, subpel0
    %define subpel0 m7
%endif
    mova       m5, m1
    PMADDUBSW m5, subpel0, m6, m4, 0 ; a0 b0
%if ARCH_X86_32 && notcpuflag(ssse3)
    mova       m7, subpel1
    %define subpel1 m7
%endif
    mova       m1, m2
    PMADDUBSW m2, subpel1, m6, m4, 0 ; a1 b1
    paddw      m5, m2
%if ARCH_X86_32 && notcpuflag(ssse3)
    mova       m7, subpel2
    %define subpel2 m7
%endif
    mova       m2, m3
    PMADDUBSW m3, subpel2, m6, m4, 0 ; a2 b2
    movd       m4, [srcq+strideq*1]
    lea        srcq, [srcq+strideq*2]
    paddw      m5, m3
    punpckldq  m3, m0, m4 ; 6 7 _ _
    movd       m0, [srcq+strideq*0]
    punpckldq  m4, m0 ; 7 8 _ _
    punpcklbw  m3, m4 ; 67 78
%if notcpuflag(ssse3)
    %if ARCH_X86_64
    SWAP m12, m0
    %else
    mova       [esp+mmsize*4], m0
    mova       m7, subpel3
    %define subpel3 m7
    %endif
%endif
    mova       m4, m3
    PMADDUBSW m4, subpel3, m6, m0, 0 ; a3 b3
    paddw      m5, m4
%if ARCH_X86_64 || cpuflag(ssse3)
    %if notcpuflag(ssse3)
    SWAP m0, m12
    %endif
    PMULHRSW_8192 m5, m5, m7
%else
    mova       m0, [esp+mmsize*4]
    PMULHRSW_8192 m5, m5, [base+pw_2]
%endif
    movq       [tmpq+wq*0], m5
    movhps     [tmpq+wq*2], m5
    lea        tmpq, [tmpq+wq*4]
    sub        hd, 2
    jg .v_w4_loop
%if ARCH_X86_32
    mov        srcq, srcm
    mov        tmpq, tmpm
    movzx      hd, r5w
    add        srcq, 4
    add        tmpq, 8
    mov        srcm, srcq
    mov        tmpm, tmpq
    sub        r5d, 1<<16 ; horizontal--
    jg .v_w4_loop0
%endif
    RET
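; 8-pixel-wide vertical loop, x86-64 only (narrower widths use .v_w4 above).
; r6d packs the column countdown in its upper bits and h in the low byte,
; mirroring the packed-counter scheme of the 4-wide loop.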
%if ARCH_X86_64
.v_w8:
    lea        r6d, [wq*8-64]
    mov        r5, srcq
    mov        r8, tmpq
    lea        r6d, [hq+r6*4]
.v_w8_loop0:
    movq       m1, [srcq+strideq*0]
    movq       m2, [srcq+strideq*1]
    movq       m3, [srcq+strideq*2]
    add        srcq, stride3q
    movq       m4, [srcq+strideq*0]
    movq       m5, [srcq+strideq*1]
    movq       m6, [srcq+strideq*2]
    add        srcq, stride3q
    movq       m0, [srcq+strideq*0]
    punpcklbw  m1, m2 ; 01
    punpcklbw  m2, m3 ; 12
    punpcklbw  m3, m4 ; 23
    punpcklbw  m4, m5 ; 34
    punpcklbw  m5, m6 ; 45
    punpcklbw  m6, m0 ; 56
.v_w8_loop:
    movq       m13, [srcq+strideq*1]
    lea        srcq, [srcq+strideq*2]
%if cpuflag(ssse3)
    pmaddubsw  m14, m1, subpel0 ; a0
    pmaddubsw  m15, m2, subpel0 ; b0
    mova       m1, m3
    mova       m2, m4
    pmaddubsw  m3, subpel1 ; a1
    pmaddubsw  m4, subpel1 ; b1
    paddw      m14, m3
    paddw      m15, m4
    mova       m3, m5
    mova       m4, m6
    pmaddubsw  m5, subpel2 ; a2
    pmaddubsw  m6, subpel2 ; b2
    punpcklbw  m12, m0, m13 ; 67
    movq       m0, [srcq+strideq*0]
    punpcklbw  m13, m0 ; 78
    paddw      m14, m5
    mova       m5, m12
    pmaddubsw  m12, subpel3 ; a3
    paddw      m15, m6
    mova       m6, m13
    pmaddubsw  m13, subpel3 ; b3
    paddw      m14, m12
    paddw      m15, m13
    pmulhrsw   m14, m7
    pmulhrsw   m15, m7
%else
    mova       m14, m1
    PMADDUBSW m14, subpel0, m7, m12, 1 ; a0
    mova       m15, m2
    PMADDUBSW m15, subpel0, m7, m12, 0 ; b0
    mova       m1, m3
    PMADDUBSW m3, subpel1, m7, m12, 0 ; a1
    mova       m2, m4
    PMADDUBSW m4, subpel1, m7, m12, 0 ; b1
    paddw      m14, m3
    mova       m3, m5
    PMADDUBSW m5, subpel2, m7, m12, 0 ; a2
    paddw      m15, m4
    mova       m4, m6
    PMADDUBSW m6, subpel2, m7, m12, 0 ; b2
    paddw      m15, m6
    punpcklbw  m12, m0, m13 ; 67
    movq       m0, [srcq+strideq*0]
    punpcklbw  m13, m0 ; 78
    paddw      m14, m5
    mova       m5, m12
    PMADDUBSW m12, subpel3, m7, m6, 0 ; a3
    paddw      m14, m12
    mova       m6, m13
    PMADDUBSW m13, subpel3, m7, m12, 0 ; b3
    paddw      m15, m13
    PMULHRSW_8192 m14, m14, [base+pw_2]
    PMULHRSW_8192 m15, m15, [base+pw_2]
%endif
    movu       [tmpq+wq*0], m14
    movu       [tmpq+wq*2], m15
    lea        tmpq, [tmpq+wq*4]
    sub        hd, 2
    jg .v_w8_loop
    add        r5, 8
    add        r8, 16
    movzx      hd, r6b
    mov        srcq, r5
    mov        tmpq, r8
    sub        r6d, 1<<8
    jg .v_w8_loop0
    RET
%endif ;ARCH_X86_64
%undef subpel0
%undef subpel1
%undef subpel2
%undef subpel3
.hv:
    RESET_STACK_STATE
    cmp        wd, 4
    jg .hv_w8
    and        mxd, 0x7f
    movd       m1, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+2]
%if ARCH_X86_32
    mov        mxd, myd
    shr        myd, 16
    and        mxd, 0x7f
    cmp        hd, 6
    cmovs      myd, mxd
    movq       m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
    mov        strideq, stridem
    %assign regs_used 6
    ALLOC_STACK -mmsize*14
    %assign regs_used 7
    lea        r5, [strideq*3+1]
    sub        srcq, r5
    %define subpelv0 [rsp+mmsize*0]
    %define subpelv1 [rsp+mmsize*1]
    %define subpelv2 [rsp+mmsize*2]
    %define subpelv3 [rsp+mmsize*3]
    punpcklbw  m0, m0
    psraw      m0, 8
    pshufd     m6, m0, q0000
    mova       subpelv0, m6
    pshufd     m6, m0, q1111
    mova       subpelv1, m6
    pshufd     m6, m0, q2222
    mova       subpelv2, m6
    pshufd     m6, m0, q3333
    mova       subpelv3, m6
%else
    movzx      mxd, myb
    shr        myd, 16
    cmp        hd, 6
    cmovs      myd, mxd
    movq       m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
    %if cpuflag(ssse3)
    ALLOC_STACK mmsize*14, 14
    %else
    ALLOC_STACK mmsize*14, 16
    %endif
    lea        stride3q, [strideq*3]
    sub        srcq, stride3q
    dec        srcq
    %define subpelv0 m10
    %define subpelv1 m11
    %define subpelv2 m12
    %define subpelv3 m13
    punpcklbw  m0, m0
    psraw      m0, 8
    %if cpuflag(ssse3)
    mova       m8, [base+pw_8192]
    %else
    mova       m8, [base+pw_2]
    %endif
    mova       m9, [base+pd_32]
    pshufd     m10, m0, q0000
    pshufd     m11, m0, q1111
    pshufd     m12, m0, q2222
    pshufd     m13, m0, q3333
%endif
    pshufd     m7, m1, q0000
%if notcpuflag(ssse3)
    punpcklbw  m7, m7
    psraw      m7, 8
%endif
%define hv4_line_0_0 4
%define hv4_line_0_1 5
%define hv4_line_0_2 6
%define hv4_line_0_3 7
%define hv4_line_0_4 8
%define hv4_line_0_5 9
%define hv4_line_1_0 10
%define hv4_line_1_1 11
%define hv4_line_1_2 12
%define hv4_line_1_3 13
%if ARCH_X86_32
    %if cpuflag(ssse3)
    %define w8192reg [base+pw_8192]
    %else
    %define w8192reg [base+pw_2]
    %endif
    %define d32reg [base+pd_32]
%else
    %define w8192reg m8
    %define d32reg m9
%endif
    ; lower shuffle 0 1 2 3 4
%if cpuflag(ssse3)
    mova       m6, [base+subpel_h_shuf4]
%else
    %if ARCH_X86_64
    mova       m15, [pw_1]
    %else
    %define m15 m1
    %endif
%endif
    movq       m5, [srcq+strideq*0] ; 0 _ _ _
    movhps     m5, [srcq+strideq*1] ; 0 _ 1 _
%if ARCH_X86_32
    lea        srcq, [srcq+strideq*2]
    movq       m4, [srcq+strideq*0] ; 2 _ _ _
    movhps     m4, [srcq+strideq*1] ; 2 _ 3 _
    lea        srcq, [srcq+strideq*2]
%else
    movq       m4, [srcq+strideq*2] ; 2 _ _ _
    movhps     m4, [srcq+stride3q ] ; 2 _ 3 _
    lea        srcq, [srcq+strideq*4]
%endif
    PSHUFB_SUBPEL_H_4a m2, m5, m6, m1, m3, 1 ;H subpel_h_shuf4 0~1~
    PSHUFB_SUBPEL_H_4a m0, m4, m6, m1, m3, 0 ;H subpel_h_shuf4 2~3~
    PMADDUBSW m2, m7, m1, m3, 1 ;H subpel_filters
    PMADDUBSW m0, m7, m1, m3, 0 ;H subpel_filters
    PHADDW m2, m0, m15, ARCH_X86_32 ;H 0 1 2 3
    PMULHRSW_8192 m2, m2, w8192reg
    SAVELINE_W4 m2, 2, 0
    ; upper shuffle 2 3 4 5 6
%if cpuflag(ssse3)
    mova       m6, [base+subpel_h_shuf4+16]
%endif
    PSHUFB_SUBPEL_H_4b m2, m5, m6, m1, m3, 0 ;H subpel_h_shuf4 0~1~
    PSHUFB_SUBPEL_H_4b m0, m4, m6, m1, m3, 0 ;H subpel_h_shuf4 2~3~
    PMADDUBSW m2, m7, m1, m3, 1 ;H subpel_filters
    PMADDUBSW m0, m7, m1, m3, 0 ;H subpel_filters
    PHADDW m2, m0, m15, ARCH_X86_32 ;H 0 1 2 3
    PMULHRSW_8192 m2, m2, w8192reg
%if notcpuflag(ssse3)
    %if ARCH_X86_64
    SWAP m14, m2
    %else
    mova       [esp+mmsize*4], m2
    %endif
%endif
    ; lower shuffle
%if cpuflag(ssse3)
    mova       m6, [base+subpel_h_shuf4]
%endif
    movq       m5, [srcq+strideq*0] ; 4 _ _ _
    movhps     m5, [srcq+strideq*1] ; 4 _ 5 _
%if ARCH_X86_32
    lea        srcq, [srcq+strideq*2]
    movq       m4, [srcq+strideq*0] ; 6 _ _ _
    add        srcq, strideq
%else
    movq       m4, [srcq+strideq*2] ; 6 _ _ _
    add        srcq, stride3q
%endif
    PSHUFB_SUBPEL_H_4a m3, m5, m6, m1, m2, 0 ;H subpel_h_shuf4 4~5~
    PSHUFB_SUBPEL_H_4a m0, m4, m6, m1, m2, 0 ;H subpel_h_shuf4 6~6~
    PMADDUBSW m3, m7, m1, m2, 1 ;H subpel_filters
    PMADDUBSW m0, m7, m1, m2, 0 ;H subpel_filters
    PHADDW m3, m0, m15, ARCH_X86_32 ;H 4 5 6 7
    PMULHRSW_8192 m3, m3, w8192reg
    SAVELINE_W4 m3, 3, 0
    ; upper shuffle
%if cpuflag(ssse3)
    mova       m6, [base+subpel_h_shuf4+16]
%endif
    PSHUFB_SUBPEL_H_4b m3, m5, m6, m1, m2, 0 ;H subpel_h_shuf4 4~5~
    PSHUFB_SUBPEL_H_4b m0, m4, m6, m1, m2, 0 ;H subpel_h_shuf4 6~6~
    PMADDUBSW m3, m7, m1, m2, 1 ;H subpel_filters
    PMADDUBSW m0, m7, m1, m2, 0 ;H subpel_filters
    PHADDW m3, m0, m15, ARCH_X86_32 ;H 4 5 6 7
    PMULHRSW_8192 m3, m3, w8192reg
%if notcpuflag(ssse3)
    %if ARCH_X86_64
    SWAP m2, m14
    %else
    mova       m2, [esp+mmsize*4]
    %endif
%endif
    ;process high
    PALIGNR m4, m3, m2, 4 ;V 1 2 3 4
    punpcklwd  m1, m2, m4 ; V 01 12
    punpckhwd  m2, m4 ; V 23 34
    pshufd     m0, m3, q2121 ;V 5 6 5 6
    punpcklwd  m3, m0 ; V 45 56
    SAVELINE_W4 m0, 0, 1
    SAVELINE_W4 m1, 1, 1
    SAVELINE_W4 m2, 2, 1
    SAVELINE_W4 m3, 3, 1
    ;process low
    RESTORELINE_W4 m2, 2, 0
    RESTORELINE_W4 m3, 3, 0
    PALIGNR m4, m3, m2, 4 ;V 1 2 3 4
    punpcklwd  m1, m2, m4 ; V 01 12
    punpckhwd  m2, m4 ; V 23 34
    pshufd     m0, m3, q2121 ;V 5 6 5 6
    punpcklwd  m3, m0 ; V 45 56
.hv_w4_loop:
    ;process low
    pmaddwd    m5, m1, subpelv0 ; V a0 b0
    mova       m1, m2
    pmaddwd    m2, subpelv1 ; V a1 b1
    paddd      m5, m2
    mova       m2, m3
    pmaddwd    m3, subpelv2 ; V a2 b2
    paddd      m5, m3
%if notcpuflag(ssse3)
    %if ARCH_X86_64
    SWAP m14, m5
    %else
    mova       [esp+mmsize*4], m5
    %define m15 m3
    %endif
%endif
%if cpuflag(ssse3)
    mova       m6, [base+subpel_h_shuf4]
%endif
    movq       m4, [srcq+strideq*0] ; 7
    movhps     m4, [srcq+strideq*1] ; 7 _ 8 _
    PSHUFB_SUBPEL_H_4a m4, m4, m6, m3, m5, 0 ; H subpel_h_shuf4 7~8~
    PMADDUBSW m4, m7, m3, m5, 1 ; H subpel_filters
    PHADDW m4, m4, m15, ARCH_X86_32 ; H 7878
    PMULHRSW_8192 m4, m4, w8192reg
    PALIGNR m3, m4, m0, 12, m5 ; 6787
    mova       m0, m4
    punpcklwd  m3, m4 ; 67 78
    pmaddwd    m4, m3, subpelv3 ; a3 b3
%if notcpuflag(ssse3)
    %if ARCH_X86_64
    SWAP m5, m14
    %else
    mova       m5, [esp+mmsize*4]
    %endif
%endif
    paddd      m5, d32reg ; pd_32
    paddd      m5, m4
    psrad      m5, 6
    SAVELINE_W4 m0, 0, 0
    SAVELINE_W4 m1, 1, 0
    SAVELINE_W4 m2, 2, 0
    SAVELINE_W4 m3, 3, 0
    SAVELINE_W4 m5, 5, 0
    ;process high
    RESTORELINE_W4 m0, 0, 1
    RESTORELINE_W4 m1, 1, 1
    RESTORELINE_W4 m2, 2, 1
    RESTORELINE_W4 m3, 3, 1
    pmaddwd    m5, m1, subpelv0 ; V a0 b0
    mova       m1, m2
    pmaddwd    m2, subpelv1 ; V a1 b1
    paddd      m5, m2
    mova       m2, m3
    pmaddwd    m3, subpelv2 ; V a2 b2
    paddd      m5, m3
%if notcpuflag(ssse3)
    %if ARCH_X86_64
    SWAP m14, m5
    %else
    mova       [esp+0xA0], m5
    %endif
%endif
%if cpuflag(ssse3)
    mova       m6, [base+subpel_h_shuf4+16]
%endif
    movq       m4, [srcq+strideq*0] ; 7
    movhps     m4, [srcq+strideq*1] ; 7 _ 8 _
    PSHUFB_SUBPEL_H_4b m4, m4, m6, m3, m5, 0 ; H subpel_h_shuf4 7~8~
    PMADDUBSW m4, m7, m3, m5, 1 ; H subpel_filters
    PHADDW m4, m4, m15, ARCH_X86_32 ; H 7878
    PMULHRSW_8192 m4, m4, w8192reg
    PALIGNR m3, m4, m0, 12, m5 ; 6787
    mova       m0, m4
    punpcklwd  m3, m4 ; 67 78
    pmaddwd    m4, m3, subpelv3 ; a3 b3
%if notcpuflag(ssse3)
    %if ARCH_X86_64
    SWAP m5, m14
    %else
    mova       m5, [esp+0xA0]
    %endif
%endif
    paddd      m5, d32reg ; pd_32
    paddd      m5, m4
    psrad      m4, m5, 6
    RESTORELINE_W4 m5, 5, 0
    packssdw   m5, m4
    pshufd     m5, m5, q3120
    movu       [tmpq], m5
    lea        srcq, [srcq+strideq*2]
    add        tmpq, 16
    sub        hd, 2
    SAVELINE_W4 m0, 0, 1
    SAVELINE_W4 m1, 1, 1
    SAVELINE_W4 m2, 2, 1
    SAVELINE_W4 m3, 3, 1
    RESTORELINE_W4 m0, 0, 0
    RESTORELINE_W4 m1, 1, 0
    RESTORELINE_W4 m2, 2, 0
    RESTORELINE_W4 m3, 3, 0
    jg .hv_w4_loop
    RET
%undef subpelv0
%undef subpelv1
%undef subpelv2
%undef subpelv3
.hv_w8:
    RESET_STACK_STATE
%define hv8_line_1 0
%define hv8_line_2 1
%define hv8_line_3 2
%define hv8_line_4 3
%define hv8_line_6 4
    shr        mxd, 16
%if ARCH_X86_32
    %define subpelh0 [rsp+mmsize*5]
    %define subpelh1 [rsp+mmsize*6]
    %define subpelv0 [rsp+mmsize*7]
    %define subpelv1 [rsp+mmsize*8]
    %define subpelv2 [rsp+mmsize*9]
    %define subpelv3 [rsp+mmsize*10]
    %define accuv0   [rsp+mmsize*11]
    %define accuv1   [rsp+mmsize*12]
    movq       m1, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX]
    mov        mxd, myd
    shr        myd, 16
    and        mxd, 0x7f
    cmp        hd, 6
    cmovs      myd, mxd
    movq       m5, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
    mov        strideq, stridem
    %assign regs_used 6
    ALLOC_STACK -mmsize*14
    %assign regs_used 7
    %if STACK_ALIGNMENT < mmsize
    %define tmpm    [rsp+mmsize*13+gprsize*1]
    %define srcm    [rsp+mmsize*13+gprsize*2]
    %define stridem [rsp+mmsize*13+gprsize*3]
    mov        tmpm, tmpq
    mov        stridem, strideq
    %endif
    %if cpuflag(ssse3)
    pshufd     m0, m1, q0000
    pshufd     m1, m1, q1111
    %else
    punpcklbw  m1, m1
    psraw      m1, 8
    pshufd     m0, m1, q1010
    punpckhqdq m1, m1
    %endif
    punpcklbw  m5, m5
    psraw      m5, 8
    pshufd     m2, m5, q0000
    pshufd     m3, m5, q1111
    pshufd     m4, m5, q2222
    pshufd     m5, m5, q3333
    mova       subpelh0, m0
    mova       subpelh1, m1
    mova       subpelv0, m2
    mova       subpelv1, m3
    mova       subpelv2, m4
    mova       subpelv3, m5
    lea        r5, [strideq*3+3]
    sub        srcq, r5
    mov        srcm, srcq
%else
    ALLOC_STACK mmsize*5, 16
    %define subpelh0 m10
    %define subpelh1 m11
    %define subpelv0 m12
    %define subpelv1 m13
    %define subpelv2 m14
    %define subpelv3 m15
    %define accuv0   m8
    %define accuv1   m9
    movq       m0, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX]
    movzx      mxd, myb
    shr        myd, 16
    cmp        hd, 6
    cmovs      myd, mxd
    movq       m1, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
    %if cpuflag(ssse3)
    pshufd     subpelh0, m0, q0000
    pshufd     subpelh1, m0, q1111
    %else
    punpcklbw  m0, m0
    psraw      m0, 8
    pshufd     subpelh0, m0, q1010
    pshufd     subpelh1, m0, q3232
    mova       m7, [base+pw_2]
    %endif
    punpcklbw  m1, m1
    psraw      m1, 8
    pshufd     subpelv0, m1, q0000
    pshufd     subpelv1, m1, q1111
    pshufd     subpelv2, m1, q2222
    pshufd     subpelv3, m1, q3333
    lea        stride3q, [strideq*3]
    sub        srcq, 3
    sub        srcq, stride3q
    mov        r6, srcq
    mov        r8, tmpq
%endif
    lea        r5d, [wq-4]
    shl        r5d, 14
    add        r5d, hd
.hv_w8_loop0:
%if cpuflag(ssse3)
    %if ARCH_X86_64
    mova       m7, [base+subpel_h_shufA]
    mova       m8, [base+subpel_h_shufB]
    mova       m9, [base+subpel_h_shufC]
    %define shufA m7
    %define shufB m8
    %define shufC m9
    %else
    %define shufA [base+subpel_h_shufA]
    %define shufB [base+subpel_h_shufB]
    %define shufC [base+subpel_h_shufC]
    %endif
%endif
    PREP_8TAP_HV m4, srcq+strideq*0, m7, m0
    PREP_8TAP_HV m5, srcq+strideq*1, m7, m0
%if ARCH_X86_64
    PREP_8TAP_HV m6, srcq+strideq*2, m7, m0
    add        srcq, stride3q
    PREP_8TAP_HV m0, srcq+strideq*0, m7, m9
%else
    lea        srcq, [srcq+strideq*2]
    %if notcpuflag(ssse3)
    mova       [esp], m4
    %endif
    PREP_8TAP_HV m6, srcq+strideq*0, m7, m4
    PREP_8TAP_HV m0, srcq+strideq*1, m7, m4
    lea        srcq, [srcq+strideq*2]
%endif
%if cpuflag(ssse3)
    mova       m7, [base+pw_8192]
%else
    mova       m7, [base+pw_2]
    %if ARCH_X86_32
    mova       m4, [esp]
    %endif
%endif
    PMULHRSW_8192 m4, m4, m7
    PMULHRSW_8192 m5, m5, m7
    PMULHRSW_8192 m6, m6, m7
    PMULHRSW_8192 m0, m0, m7
    punpcklwd  m1, m4, m5 ; 01
    punpcklwd  m2, m5, m6 ; 12
    punpcklwd  m3, m6, m0 ; 23
    SAVELINE_W8 1, m1
    SAVELINE_W8 2, m2
    SAVELINE_W8 3, m3
%if cpuflag(ssse3)
    mova       m7, [base+subpel_h_shufA]
%endif
%if ARCH_X86_64
    PREP_8TAP_HV m4, srcq+strideq*1, m8, m9
    PREP_8TAP_HV m5, srcq+strideq*2, m8, m9
    add        srcq, stride3q
    PREP_8TAP_HV m6, srcq+strideq*0, m8, m9
%else
    %if notcpuflag(ssse3)
    mova       [esp+0x30], m0
    %endif
    PREP_8TAP_HV m4, srcq+strideq*0, m7, m0
    PREP_8TAP_HV m5, srcq+strideq*1, m7, m0
    lea        srcq, [srcq+strideq*2]
    PREP_8TAP_HV m6, srcq+strideq*0, m7, m0
%endif
%if cpuflag(ssse3)
    mova       m7, [base+pw_8192]
%elif ARCH_X86_32
    mova       m0, [esp+0x30]
    mova       m7, [base+pw_2]
%endif
    PMULHRSW_8192 m1, m4, m7
    PMULHRSW_8192 m2, m5, m7
    PMULHRSW_8192 m3, m6, m7
    punpcklwd  m4, m0, m1 ; 34
    punpcklwd  m5, m1, m2 ; 45
    punpcklwd  m6, m2, m3 ; 56
    SAVELINE_W8 6, m3
    RESTORELINE_W8 1, m1
    RESTORELINE_W8 2, m2
    RESTORELINE_W8 3, m3
.hv_w8_loop:
    SAVELINE_W8 1, m3
    SAVELINE_W8 2, m4
    SAVELINE_W8 3, m5
    SAVELINE_W8 4, m6
%if ARCH_X86_32
    pmaddwd    m0, m1, subpelv0 ; a0
    pmaddwd    m7, m2, subpelv0 ; b0
    pmaddwd    m3, subpelv1     ; a1
    pmaddwd    m4, subpelv1     ; b1
    paddd      m0, m3
    paddd      m7, m4
    pmaddwd    m5, subpelv2     ; a2
    pmaddwd    m6, subpelv2     ; b2
    paddd      m0, m5
    paddd      m7, m6
    mova       m5, [base+pd_32]
    paddd      m0, m5
    paddd      m7, m5
    mova       accuv0, m0
    mova       accuv1, m7
%else
    pmaddwd    accuv0, m1, subpelv0 ; a0
    pmaddwd    accuv1, m2, subpelv0 ; b0
    pmaddwd    m3, subpelv1 ; a1
    pmaddwd    m4, subpelv1 ; b1
    paddd      accuv0, m3
    paddd      accuv1, m4
    pmaddwd    m5, subpelv2 ; a2
    pmaddwd    m6, subpelv2 ; b2
    paddd      accuv0, m5
    paddd      accuv1, m6
    mova       m7, [base+pd_32]
    paddd      accuv0, m7
    paddd      accuv1, m7
    %if cpuflag(ssse3)
    mova       m7, [base+subpel_h_shufB]
    mova       m6, [base+subpel_h_shufC]
    mova       m5, [base+subpel_h_shufA]
    %define shufA m5
    %define shufB m7
    %define shufC m6
    %endif
%endif
    PREP_8TAP_HV m0, srcq+strideq*1, m5, m6
    lea        srcq, [srcq+strideq*2]
    PREP_8TAP_HV m4, srcq+strideq*0, m5, m6
%if cpuflag(ssse3)
    mova       m5, [base+pw_8192]
%else
    mova       m5, [base+pw_2]
%endif
    PMULHRSW_8192 m0, m0, m5
    PMULHRSW_8192 m4, m4, m5
    RESTORELINE_W8 6, m6
    punpcklwd  m5, m6, m0 ; 67
    punpcklwd  m6, m0, m4 ; 78
    pmaddwd    m1, m5, subpelv3 ; a3
    paddd      m2, m1, accuv0
    pmaddwd    m1, m6, subpelv3 ; b3
    paddd      m1, m1, accuv1
    psrad      m2, 6
    psrad      m1, 6
    packssdw   m2, m1
    movq       [tmpq+wq*0], m2
    movhps     [tmpq+wq*2], m2
    lea        tmpq, [tmpq+wq*4]
    sub        hd, 2
    jle .hv_w8_outer
    SAVELINE_W8 6, m4
    RESTORELINE_W8 1, m1
    RESTORELINE_W8 2, m2
    RESTORELINE_W8 3, m3
    RESTORELINE_W8 4, m4
    jmp .hv_w8_loop
.hv_w8_outer:
%if ARCH_X86_32
    mov        srcq, srcm
    mov        tmpq, tmpm
    movzx      hd, r5w
    add        srcq, 4
    add        tmpq, 8
    mov        srcm, srcq
    mov        tmpm, tmpq
%else
    add        r6, 4
    add        r8, 8
    movzx      hd, r5b
    mov        srcq, r6
    mov        tmpq, r8
%endif
    sub        r5d, 1<<16
    jg .hv_w8_loop0
    RET
%endmacro

%macro movifprep 2
 %if isprep
    mov        %1, %2
 %endif
%endmacro

%macro SAVE_REG 1
 %xdefine r%1_save  r%1
 %xdefine r%1q_save r%1q
 %xdefine r%1d_save r%1d
 %if ARCH_X86_32
  %define r%1m_save [rstk+stack_offset+(%1+1)*4]
 %endif
%endmacro

%macro LOAD_REG 1
 %xdefine r%1  r%1_save
 %xdefine r%1q r%1q_save
 %xdefine r%1d r%1d_save
 %if ARCH_X86_32
  %define r%1m r%1m_save
 %endif
 %undef r%1d_save
 %undef r%1q_save
 %undef r%1_save
%endmacro

%macro REMAP_REG 2-3
 %xdefine r%1  r%2
 %xdefine r%1q r%2q
 %xdefine r%1d r%2d
 %if ARCH_X86_32
  %if %3 == 0
   %xdefine r%1m r%2m
  %else
   %define r%1m [rstk+stack_offset+(%1+1)*4]
  %endif
 %endif
%endmacro

%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0
 %if isprep
  %if ARCH_X86_64
   SAVE_REG 14
   %assign %%i 14
   %rep 14
    %assign %%j %%i-1
    REMAP_REG %%i, %%j
    %assign %%i %%i-1
   %endrep
  %else
   SAVE_REG 5
   %assign %%i 5
   %rep 5
    %assign %%j %%i-1
    REMAP_REG %%i, %%j, 0
    %assign %%i %%i-1
   %endrep
  %endif
 %endif
%endmacro
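; prep_8tap_scaled has one register argument less than put_8tap_scaled, so
; when isprep is set the remap helpers above/below shift every GPR alias
; down by one (rN becomes rN-1), and MC_8TAP_SCALED_RET restores the
; default mapping around the actual RET.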
%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0
 %if isprep
  %assign %%i 1
  %if ARCH_X86_64
   %rep 13
    %assign %%j %%i+1
    REMAP_REG %%i, %%j
    %assign %%i %%i+1
   %endrep
   LOAD_REG 14
  %else
   %rep 4
    %assign %%j %%i+1
    REMAP_REG %%i, %%j, 1
    %assign %%i %%i+1
   %endrep
   LOAD_REG 5
  %endif
 %endif
%endmacro

%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged
    MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
    RET
 %if %1
    MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
 %endif
%endmacro

%if ARCH_X86_64
 %macro MC_8TAP_SCALED_H 12 ; dst[0-1], tmp[0-5], weights[0-3]
    SWAP m%2, m%5
    movq       m%1, [srcq+ r4]
    movq       m%2, [srcq+ r6]
    movhps     m%1, [srcq+ r7]
    movhps     m%2, [srcq+ r9]
    movq       m%3, [srcq+r10]
    movq       m%4, [srcq+r11]
    movhps     m%3, [srcq+r13]
    movhps     m%4, [srcq+ rX]
    add        srcq, ssq
    movq       m%5, [srcq+ r4]
    movq       m%6, [srcq+ r6]
    movhps     m%5, [srcq+ r7]
    movhps     m%6, [srcq+ r9]
    movq       m%7, [srcq+r10]
    movq       m%8, [srcq+r11]
    movhps     m%7, [srcq+r13]
    movhps     m%8, [srcq+ rX]
    add        srcq, ssq
    pmaddubsw  m%1, m%9
    pmaddubsw  m%5, m%9
    pmaddubsw  m%2, m%10
    pmaddubsw  m%6, m%10
    pmaddubsw  m%3, m%11
    pmaddubsw  m%7, m%11
    pmaddubsw  m%4, m%12
    pmaddubsw  m%8, m%12
    phaddw     m%1, m%2
    phaddw     m%5, m%6
    phaddw     m%3, m%4
    phaddw     m%7, m%8
    phaddw     m%1, m%3
    phaddw     m%5, m%7
    pmulhrsw   m%1, m12
    pmulhrsw   m%5, m12
    SWAP m%2, m%5
 %endmacro
%else
 %macro MC_8TAP_SCALED_H 2-3 1 ; weights_mem_start, h_mem_start, load_fh_offsets
  %if %3 == 1
    mov        r0, [esp+ 0]
    mov        rX, [esp+ 8]
    mov        r4, [esp+ 4]
    mov        r5, [esp+12]
  %endif
    movq       m0, [srcq+r0]
    movq       m1, [srcq+rX]
    movhps     m0, [srcq+r4]
    movhps     m1, [srcq+r5]
    add        srcq, ssq
    movq       m4, [srcq+r0]
    movq       m5, [srcq+rX]
    movhps     m4, [srcq+r4]
    movhps     m5, [srcq+r5]
    mov        r0, [esp+16]
    mov        rX, [esp+24]
    mov        r4, [esp+20]
    mov        r5, [esp+28]
    sub        srcq, ssq
    movq       m2, [srcq+r0]
    movq       m3, [srcq+rX]
    movhps     m2, [srcq+r4]
    movhps     m3, [srcq+r5]
    add        srcq, ssq
    movq       m6, [srcq+r0]
    movq       m7, [srcq+rX]
    movhps     m6, [srcq+r4]
    movhps     m7, [srcq+r5]
    add        srcq, ssq
    pmaddubsw  m0, [esp+%1+ 0]
    pmaddubsw  m4, [esp+%1+ 0]
    pmaddubsw  m1, [esp+%1+16]
    pmaddubsw  m5, [esp+%1+16]
    pmaddubsw  m2, [esp+%1+32]
    pmaddubsw  m6, [esp+%1+32]
    pmaddubsw  m3, [esp+%1+48]
    pmaddubsw  m7, [esp+%1+48]
    phaddw     m0, m1
    phaddw     m4, m5
    phaddw     m2, m3
    phaddw     m6, m7
    phaddw     m0, m2
    phaddw     m4, m6
    pmulhrsw   m0, m12
    pmulhrsw   m4, m12
  %if %2 != 0
    mova       [esp+%2+ 0], m0
    mova       [esp+%2+16], m4
  %endif
 %endmacro
%endif
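; Scaled MC cannot fetch its horizontal window with one shuffle because
; every output pixel may start at a different source column, so
; MC_8TAP_SCALED_H gathers the pixel groups through precomputed per-column
; offsets held in GPRs (x86-64) or spilled to the stack (x86-32) before
; the pmaddubsw/phaddw reduction.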
%macro MC_8TAP_SCALED 1
%ifidn %1, put
 %assign isprep 0
 %if ARCH_X86_64
  %if required_stack_alignment <= STACK_ALIGNMENT
cglobal put_8tap_scaled_8bpc, 2, 15, 16, 0x180, dst, ds, src, ss, w, h, mx, my, dx, dy
  %else
cglobal put_8tap_scaled_8bpc, 2, 14, 16, 0x180, dst, ds, src, ss, w, h, mx, my, dx, dy
  %endif
 %else ; ARCH_X86_32
  %if required_stack_alignment <= STACK_ALIGNMENT
cglobal put_8tap_scaled_8bpc, 0, 7, 8, 0x200, dst, ds, src, ss, w, h, mx, my, dx, dy
  %else
cglobal put_8tap_scaled_8bpc, 0, 7, 8, -0x200-0x20, dst, ds, src, ss, w, h, mx, my, dx, dy
  %endif
 %endif
 %xdefine base_reg r12
 %define rndshift 10
%else ; prep
 %assign isprep 1
 %if ARCH_X86_64
  %if required_stack_alignment <= STACK_ALIGNMENT
cglobal prep_8tap_scaled_8bpc, 2, 15, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy
   %xdefine tmp_stridem r14q
  %else
cglobal prep_8tap_scaled_8bpc, 2, 14, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy
   %define tmp_stridem qword [rsp+0x138]
  %endif
  %xdefine base_reg r11
 %else ; ARCH_X86_32
  %if required_stack_alignment <= STACK_ALIGNMENT
cglobal prep_8tap_scaled_8bpc, 0, 7, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy
  %else
cglobal prep_8tap_scaled_8bpc, 0, 6, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy
  %endif
  %define tmp_stridem dword [esp+0x138]
 %endif
 %define rndshift 6
%endif
%if ARCH_X86_32
    mov        [esp+0x1f0], t0d
    mov        [esp+0x1f4], t1d
 %if !isprep && required_stack_alignment > STACK_ALIGNMENT
    mov        dstd, dstm
    mov        dsd, dsm
    mov        srcd, srcm
    mov        ssd, ssm
    mov        hd, hm
    mov        r4, mxm
  %define r0m [esp+0x200]
  %define dsm [esp+0x204]
  %define dsmp dsm
  %define r1m dsm
  %define r2m [esp+0x208]
  %define ssm [esp+0x20c]
  %define r3m ssm
  %define hm  [esp+0x210]
  %define mxm [esp+0x214]
    mov        r0m, dstd
    mov        dsm, dsd
    mov        r2m, srcd
    mov        ssm, ssd
    mov        hm, hd
    mov        r0, mym
    mov        r1, dxm
    mov        r2, dym
  %define mym [esp+0x218]
  %define dxm [esp+0x09c]
  %define dym [esp+0x21c]
    mov        mxm, r4
    mov        mym, r0
    mov        dxm, r1
    mov        dym, r2
    tzcnt      wd, wm
 %endif
 %if isprep && required_stack_alignment > STACK_ALIGNMENT
  %xdefine base_reg r5
 %else
  %xdefine base_reg r6
 %endif
    mov        ssd, ssm
%endif
    LEA base_reg, %1_8tap_scaled_8bpc_ssse3
%xdefine base base_reg-%1_8tap_scaled_8bpc_ssse3
%if ARCH_X86_64 || isprep || required_stack_alignment <= STACK_ALIGNMENT
    tzcnt      wd, wm
%endif
%if ARCH_X86_32
 %define m8  m0
 %define m9  m1
 %define m14 m4
 %define m15 m3
%endif
    movd       m8, dxm
    movd       m14, mxm
    pshufd     m8, m8, q0000
    pshufd     m14, m14, q0000
%if isprep && UNIX64
    mov        r5d, t0d
 DECLARE_REG_TMP 5, 7
%endif
%if ARCH_X86_64
    mov        dyd, dym
%endif
%ifidn %1, put
 %if WIN64
    mov        r8d, hm
  DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3
  %define hm  r5m
  %define dxm r8m
 %elif ARCH_X86_64
  DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3
  %define hm r6m
 %endif
 %if ARCH_X86_64
  %if required_stack_alignment > STACK_ALIGNMENT
   %define dsm [rsp+0x138]
   %define rX  r1
   %define rXd r1d
  %else
   %define dsm dsq
   %define rX  r14
   %define rXd r14d
  %endif
 %else
  %define rX r1
 %endif
%else ; prep
 %if WIN64
    mov        r7d, hm
  DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3
  %define hm  r4m
  %define dxm r7m
 %elif ARCH_X86_64
  DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3
  %define hm [rsp+0x94]
 %endif
 MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
 %if ARCH_X86_64
  %define rX  r14
  %define rXd r14d
 %else
  %define rX r3
 %endif
%endif
%if ARCH_X86_64
    mova       m10, [base+pd_0x3ff]
    mova       m12, [base+pw_8192]
 %ifidn %1, put
    mova       m13, [base+pd_512]
 %else
    mova       m13, [base+pd_32]
 %endif
%else
 %define m10 [base+pd_0x3ff]
 %define m12 [base+pw_8192]
 %ifidn %1, put
  %define m13 [base+pd_512]
 %else
  %define m13 [base+pd_32]
 %endif
%endif
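; m13 holds the rounding bias matching rndshift: put rounds with pd_512
; before its 10-bit downshift, prep with pd_32 before shifting by 6.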
    pxor       m9, m9
%if ARCH_X86_64
    lea        ss3q, [ssq*3]
    movzx      r7d, t1b
    shr        t1d, 16
    cmp        hd, 6
    cmovs      t1d, r7d
    sub        srcq, ss3q
%else
 MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
    mov        r1, [esp+0x1f4]
    lea        r0, [ssq*3]
    movzx      r2, r1b
    shr        r1, 16
    cmp        dword hm, 6
    cmovs      r1, r2
    mov        [esp+0x1f4], r1
    mov        r1, r1m
    mov        r2, r2m
    sub        srcq, r0
 MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
 %define ss3q r0
 %define myd r4
 %define dyd dword dym
 %define hd  dword hm
%endif
    cmp        dyd, 1024
    je .dy1
    cmp        dyd, 2048
    je .dy2
    movzx      wd, word [base+%1_8tap_scaled_ssse3_table+wq*2]
    add        wq, base_reg
    jmp        wq
%ifidn %1, put
.w2:
 %if ARCH_X86_64
    mov        myd, mym
    movzx      t0d, t0b
    dec        srcq
    movd       m15, t0d
 %else
    movzx      r4, byte [esp+0x1f0]
    dec        srcq
    movd       m15, r4
 %endif
    punpckldq  m9, m8
    SWAP m8, m9
    paddd      m14, m8 ; mx+dx*[0-1]
 %if ARCH_X86_64
    mova       m11, [base+pd_0x4000]
 %else
  %define m11 [base+pd_0x4000]
 %endif
    pshufd     m15, m15, q0000
    pand       m8, m14, m10
    psrld      m8, 6
    paddd      m15, m8
    movd       r4d, m15
    psrldq     m15, 4
 %if ARCH_X86_64
    movd       r6d, m15
 %else
    movd       r3d, m15
 %endif
    mova       m5, [base+bdct_lb_dw]
    mova       m6, [base+subpel_s_shuf2]
    movd       m15, [base+subpel_filters+r4*8+2]
 %if ARCH_X86_64
    movd       m7, [base+subpel_filters+r6*8+2]
 %else
    movd       m7, [base+subpel_filters+r3*8+2]
 %endif
    pxor       m9, m9
    pcmpeqd    m8, m9
    psrld      m14, 10
 %if ARCH_X86_32
    mov        r3, r3m
    pshufb     m14, m5
    paddb      m14, m6
    mova       [rsp+0x180], m14
    SWAP m5, m0
    SWAP m6, m3
  %define m8  m5
  %define m15 m6
 %endif
    movq       m0, [srcq+ssq*0]
    movq       m2, [srcq+ssq*2]
    movhps     m0, [srcq+ssq*1]
    movhps     m2, [srcq+ss3q ]
    lea        srcq, [srcq+ssq*4]
 %if ARCH_X86_64
    pshufb     m14, m5
    paddb      m14, m6
 %endif
    movq       m1, [srcq+ssq*0]
    movq       m3, [srcq+ssq*2]
    movhps     m1, [srcq+ssq*1]
    movhps     m3, [srcq+ss3q ]
    lea        srcq, [srcq+ssq*4]
    punpckldq  m15, m7
    punpcklqdq m15, m15
 %if ARCH_X86_64
    pand       m11, m8
    pandn      m8, m15
    SWAP m15, m8
    por        m15, m11
 %else
    pand       m7, m8, m11
    pandn      m8, m15
  %define m8  m6
  %define m15 m5
    por        m15, m7
    mova       [rsp+0x190], m15
 %endif
    pshufb     m0, m14
    pshufb     m2, m14
    pshufb     m1, m14
    pshufb     m3, m14
    pmaddubsw  m0, m15
    pmaddubsw  m2, m15
    pmaddubsw  m1, m15
    pmaddubsw  m3, m15
    phaddw     m0, m2
    phaddw     m1, m3
    pmulhrsw   m0, m12 ; 0 1 2 3
    pmulhrsw   m1, m12 ; 4 5 6 7
    palignr    m2, m1, m0, 4 ; 1 2 3 4
    punpcklwd  m3, m0, m2 ; 01 12
    punpckhwd  m0, m2     ; 23 34
    pshufd     m5, m1, q0321 ; 5 6 7 _
    punpcklwd  m2, m1, m5 ; 45 56
    punpckhwd  m4, m1, m5 ; 67 __
 %if ARCH_X86_32
    mov        myd, mym
    mov        r0, r0m
    mova       [rsp+0x1a0], m3
    mova       [rsp+0x1b0], m0
    mova       [rsp+0x1c0], m2
    mova       [rsp+0x1d0], m4
 %endif
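; Per-row vertical filter reload: myd accumulates dy each row. When its
; fractional part is zero the cmovnz below is skipped and 64<<24 remains,
; which after unpacking yields the identity filter (a single tap of 64);
; otherwise the proper 8-tap coefficients are fetched from subpel_filters.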
.w2_loop:
    and        myd, 0x3ff
 %if ARCH_X86_64
    mov        r6d, 64 << 24
    mov        r4d, myd
    shr        r4d, 6
    lea        r4d, [t1+r4]
    cmovnz     r6q, [base+subpel_filters+r4*8]
    movq       m11, r6q
    punpcklbw  m11, m11
    psraw      m11, 8
    pshufd     m8, m11, q0000
    pshufd     m9, m11, q1111
    pshufd     m10, m11, q2222
    pshufd     m11, m11, q3333
    pmaddwd    m5, m3, m8
    pmaddwd    m6, m0, m9
    pmaddwd    m7, m2, m10
    pmaddwd    m8, m4, m11
    paddd      m5, m6
    paddd      m7, m8
 %else
    mov        mym, myd
    mov        r1, [esp+0x1f4]
    xor        r3, r3
    shr        r4, 6
    lea        r1, [r1+r4]
    mov        r4, 64 << 24
    cmovnz     r4, [base+subpel_filters+r1*8+0]
    cmovnz     r3, [base+subpel_filters+r1*8+4]
    movd       m7, r4
    movd       m6, r3
    punpckldq  m7, m6
    punpcklbw  m7, m7
    psraw      m7, 8
    pshufd     m5, m7, q0000
    pshufd     m6, m7, q1111
    pmaddwd    m3, m5
    pmaddwd    m0, m6
    pshufd     m5, m7, q2222
    pshufd     m7, m7, q3333
    pmaddwd    m2, m5
    pmaddwd    m4, m7
    paddd      m3, m0
    paddd      m2, m4
    SWAP m5, m3
    SWAP m7, m2
 %endif
    paddd      m5, m13
    paddd      m5, m7
    psrad      m5, 10
    packssdw   m5, m5
    packuswb   m5, m5
 %if ARCH_X86_64
    pextrw     r6d, m5, 0
    mov        [dstq], r6w
    add        dstq, dsq
    dec        hd
    jz .ret
    add        myd, dyd
 %else
    pextrw     r3d, m5, 0
    mov        [dstq], r3w
    add        dstq, dsm
    dec        hd
    jz .ret
    mov        myd, mym
    add        myd, dym
 %endif
    test       myd, ~0x3ff
 %if ARCH_X86_32
    SWAP m3, m5
    SWAP m2, m7
    mova       m3, [rsp+0x1a0]
    mova       m0, [rsp+0x1b0]
    mova       m2, [rsp+0x1c0]
    mova       m4, [rsp+0x1d0]
  %define m14 [esp+0x180]
  %define m15 [esp+0x190]
 %endif
    jz .w2_loop
 %if ARCH_X86_32
    mov        r3, r3m
 %endif
    movq       m5, [srcq]
    test       myd, 0x400
    jz .w2_skip_line
    add        srcq, ssq
    shufps     m3, m0, q1032 ; 01 12
    shufps     m0, m2, q1032 ; 23 34
    shufps     m2, m4, q1032 ; 45 56
    pshufb     m5, m14
    pmaddubsw  m5, m15
    phaddw     m5, m5
    pmulhrsw   m5, m12
    palignr    m4, m5, m1, 12
    punpcklqdq m1, m4, m4 ; 6 7 6 7
    punpcklwd  m4, m1, m5 ; 67 __
 %if ARCH_X86_32
    mova       [rsp+0x1a0], m3
    mova       [rsp+0x1b0], m0
    mova       [rsp+0x1c0], m2
    mova       [rsp+0x1d0], m4
 %endif
    jmp .w2_loop
.w2_skip_line:
    movhps     m5, [srcq+ssq*1]
    lea        srcq, [srcq+ssq*2]
    mova       m3, m0 ; 01 12
    mova       m0, m2 ; 23 34
    pshufb     m5, m14
    pmaddubsw  m5, m15
    phaddw     m5, m5
    pmulhrsw   m5, m12 ; 6 7 6 7
    palignr    m4, m5, m1, 8 ; 4 5 6 7
    pshufd     m5, m4, q0321  ; 5 6 7 _
    mova       m1, m4
    punpcklwd  m2, m4, m5 ; 45 56
    punpckhwd  m4, m5     ; 67 __
 %if ARCH_X86_32
    mova       [rsp+0x1a0], m3
    mova       [rsp+0x1b0], m0
    mova       [rsp+0x1c0], m2
    mova       [rsp+0x1d0], m4
 %endif
    jmp .w2_loop
%endif
INIT_XMM ssse3
.w4:
%if ARCH_X86_64
    mov        myd, mym
    movzx      t0d, t0b
    dec        srcq
    movd       m15, t0d
%else
 %define m8 m0
 %xdefine m14 m4
 %define m15 m3
    movzx      r4, byte [esp+0x1f0]
    dec        srcq
    movd       m15, r4
%endif
    pmaddwd    m8, [base+rescale_mul]
%if ARCH_X86_64
    mova       m11, [base+pd_0x4000]
%else
 %define m11 [base+pd_0x4000]
%endif
    pshufd     m15, m15, q0000
    paddd      m14, m8 ; mx+dx*[0-3]
    pand       m0, m14, m10
    psrld      m0, 6
    paddd      m15, m0
    psrldq     m7, m15, 8
%if ARCH_X86_64
    movd       r4d, m15
    movd       r11d, m7
    psrldq     m15, 4
    psrldq     m7, 4
    movd       r6d, m15
    movd       r13d, m7
    movd       m15, [base+subpel_filters+ r4*8+2]
    movd       m2, [base+subpel_filters+r11*8+2]
    movd       m3, [base+subpel_filters+ r6*8+2]
    movd       m4, [base+subpel_filters+r13*8+2]
%else
    movd       r0, m15
    movd       rX, m7
    psrldq     m15, 4
    psrldq     m7, 4
    movd       r4, m15
    movd       r5, m7
    movd       m1, [base+subpel_filters+r0*8+2]
    movd       m2, [base+subpel_filters+rX*8+2]
    movd       m3, [base+subpel_filters+r4*8+2]
    movd       m7, [base+subpel_filters+r5*8+2]
    movifprep r3, r3m
    SWAP m4, m7
 %define m15 m1
%endif
    mova       m5, [base+bdct_lb_dw]
    movq       m6, [base+subpel_s_shuf2]
    psrld      m14, 10
    punpckldq  m15, m3
    punpckldq  m2, m4
    punpcklqdq m15, m2
    punpcklqdq m6, m6
    pshufb     m14, m5
    paddb      m14, m6
%if ARCH_X86_64
    pcmpeqd    m0, m9
    pand       m11, m0
%else
    mova       [esp+0x180], m14
    SWAP m7, m4
    pxor       m3, m3
    pcmpeqd    m0, m3
    pand       m2, m11, m0
 %define m11 m2
%endif
    pandn      m0, m15
%if ARCH_X86_64
    SWAP m15, m0
%else
 %define m15 m0
%endif
    por        m15, m11
%if ARCH_X86_64
    movu       m7, [srcq+ssq*0]
    movu       m9, [srcq+ssq*1]
    movu       m8, [srcq+ssq*2]
    movu       m10, [srcq+ss3q ]
    lea        srcq, [srcq+ssq*4]
    movu       m2, [srcq+ssq*0]
    movu       m4, [srcq+ssq*1]
    movu       m3, [srcq+ssq*2]
    movu       m5, [srcq+ss3q ]
    lea        srcq, [srcq+ssq*4]
    pshufb     m7, m14
    pshufb     m9, m14
    pshufb     m8, m14
    pshufb     m10, m14
    pshufb     m2, m14
    pshufb     m4, m14
    pshufb     m3, m14
    pshufb     m5, m14
    pmaddubsw  m7, m15
    pmaddubsw  m9, m15
    pmaddubsw  m8, m15
    pmaddubsw  m10, m15
    pmaddubsw  m2, m15
    pmaddubsw  m4, m15
    pmaddubsw  m3, m15
    pmaddubsw  m5, m15
    phaddw     m7, m9
    phaddw     m8, m10
    phaddw     m9, m2, m4
    phaddw     m3, m5
    pmulhrsw   m7, m12 ; 0 1
    pmulhrsw   m8, m12 ; 2 3
    pmulhrsw   m9, m12 ; 4 5
    pmulhrsw   m3, m12 ; 6 7
    shufps     m4, m7, m8, q1032 ; 1 2
    shufps     m5, m8, m9, q1032 ; 3 4
    shufps     m6, m9, m3, q1032 ; 5 6
    psrldq     m11, m3, 8        ; 7 _
    punpcklwd  m0, m7, m4 ; 01
    punpckhwd  m7, m4     ; 12
    punpcklwd  m1, m8, m5 ; 23
    punpckhwd  m8, m5     ; 34
    punpcklwd  m2, m9, m6 ; 45
    punpckhwd  m9, m6     ; 56
    punpcklwd  m3, m11    ; 67
    mova       [rsp+0x00], m7
    mova       [rsp+0x10], m8
    mova       [rsp+0x20], m9
%else
    mova       [esp+0x190], m15
    lea        ss3q, [ssq*3]
    movu       m2, [srcq+ssq*0]
    movu       m3, [srcq+ssq*1]
    movu       m7, [srcq+ssq*2]
    movu       m6, [srcq+ss3q ]
    lea        srcq, [srcq+ssq*4]
    pshufb     m2, m14
    pshufb     m3, m14
    pshufb     m7, m14
    pshufb     m6, m14
    pmaddubsw  m2, m15
    pmaddubsw  m3, m15
    pmaddubsw  m7, m15
    pmaddubsw  m6, m15
    phaddw     m2, m3
    phaddw     m7, m6
    movu       m1, [srcq+ssq*0]
    movu       m5, [srcq+ssq*1]
    movu       m3, [srcq+ssq*2]
    movu       m6, [srcq+ss3q ]
    lea        srcq, [srcq+ssq*4]
    pshufb     m1, m14
    pshufb     m5, m14
    pshufb     m3, m14
    pshufb     m6, m14
    pmaddubsw  m1, m15
    pmaddubsw  m5, m15
    pmaddubsw  m3, m15
    pmaddubsw  m6, m15
    phaddw     m1, m5
    phaddw     m3, m6
    pmulhrsw   m2, m12
    pmulhrsw   m7, m12
    pmulhrsw   m1, m12
    pmulhrsw   m3, m12
    shufps     m4, m2, m7, q1032 ; 1 2
    shufps     m5, m7, m1, q1032 ; 3 4
    shufps     m6, m1, m3, q1032 ; 5 6
    psrldq     m0, m3, 8         ; 7 _
    mova       [esp+0x1a0], m0
 %define m11 [esp+0x1a0]
    punpcklwd  m0, m2, m4 ; 01
    punpckhwd  m2, m4     ; 12
    punpcklwd  m4, m7, m5 ; 23
    punpckhwd  m7, m5     ; 34
    punpcklwd  m5, m1, m6 ; 45
    punpckhwd  m1, m6     ; 56
    punpcklwd  m3, [esp+0x1a0] ; 67
    mov        myd, mym
    mov        r0, r0m
    mova       [esp+0x1b0], m0 ; 01
    mova       [esp+0x1c0], m4 ; 23
    mova       [esp+0x1d0], m5 ; 45
    mova       [esp+0x1e0], m3 ; 67
    mova       [rsp+0x00], m2 ; 12
    mova       [rsp+0x10], m7 ; 34
    mova       [rsp+0x20], m1 ; 56
    SWAP m1, m4
    SWAP m2, m5
%endif
    movq m10, r6q
    punpcklbw m10, m10
    psraw m10, 8
    pshufd m7, m10, q0000
    pshufd m8, m10, q1111
    pshufd m9, m10, q2222
    pshufd m10, m10, q3333
    pmaddwd m4, m0, m7
    pmaddwd m5, m1, m8
    pmaddwd m6, m2, m9
    pmaddwd m7, m3, m10
    paddd m4, m5
    paddd m6, m7
    paddd m4, m13
    paddd m4, m6
%else
    mov mym, myd
    mov r5, [esp+0x1f4]
    xor r3, r3
    shr r4, 6
    lea r5, [r5+r4]
    mov r4, 64 << 24
    cmovnz r4, [base+subpel_filters+r5*8+0]
    cmovnz r3, [base+subpel_filters+r5*8+4]
    movd m7, r4
    movd m6, r3
    punpckldq m7, m6
    punpcklbw m7, m7
    psraw m7, 8
    pshufd m4, m7, q0000
    pshufd m5, m7, q1111
    pshufd m6, m7, q2222
    pshufd m7, m7, q3333
    pmaddwd m0, m4
    pmaddwd m1, m5
    pmaddwd m2, m6
    pmaddwd m3, m7
    paddd m0, m1
    paddd m2, m3
    paddd m0, m13
    paddd m0, m2
    SWAP m4, m0
%endif
    psrad m4, rndshift
    packssdw m4, m4
%ifidn %1, put
    packuswb m4, m4
    movd [dstq], m4
    add dstq, dsmp
%else
    movq [tmpq], m4
    add tmpq, 8
%endif
    dec hd
    jz .ret
%if ARCH_X86_64
    add myd, dyd
    test myd, ~0x3ff
    jz .w4_loop
%else
    SWAP m0, m4
    mov myd, mym
    mov r3, r3m
    add myd, dym
    test myd, ~0x3ff
    jnz .w4_next_line
    mova m0, [esp+0x1b0]
    mova m1, [esp+0x1c0]
    mova m2, [esp+0x1d0]
    mova m3, [esp+0x1e0]
    jmp .w4_loop
.w4_next_line:
 %define m14 [esp+0x180]
 %define m15 [esp+0x190]
%endif
    movu m4, [srcq]
    test myd, 0x400
    jz .w4_skip_line
%if ARCH_X86_64
    mova m0, [rsp+0x00]
    mova [rsp+0x00], m1
    mova m1, [rsp+0x10]
    mova [rsp+0x10], m2
    mova m2, [rsp+0x20]
    mova [rsp+0x20], m3
%else
    mova m5, [esp+0x1c0]
    mova m0, [rsp+0x000]
    mova [rsp+0x00], m5
    mova [esp+0x1b0], m0
    mova m6, [esp+0x1d0]
    mova m1, [rsp+0x010]
    mova [rsp+0x10], m6
    mova [esp+0x1c0], m1
    mova m7, [esp+0x1e0]
    mova m2, [rsp+0x020]
    mova [rsp+0x20], m7
    mova [esp+0x1d0], m2
%endif
    pshufb m4, m14
    pmaddubsw m4, m15
    phaddw m4, m4
    pmulhrsw m4, m12
    punpcklwd m3, m11, m4
%if ARCH_X86_32
    mova [esp+0x1e0], m3
%endif
    mova m11, m4
    add srcq, ssq
    jmp .w4_loop
.w4_skip_line:
%if ARCH_X86_32
    mova m0, [esp+0x1c0]
    mova m1, [esp+0x1d0]
    mova m2, [esp+0x1e0]
%endif
    movu m5, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    mova m6, [rsp+0x10]
    mova m7, [rsp+0x20]
    pshufb m4, m14
    pshufb m5, m14
    pmaddubsw m4, m15
    pmaddubsw m5, m15
    phaddw m4, m5
    pmulhrsw m4, m12
    punpcklwd m5, m11, m4
    mova [rsp+0x00], m6
    mova [rsp+0x10], m7
    mova [rsp+0x20], m5
%if ARCH_X86_64
    psrldq m11, m4, 8
    mova m0, m1
    mova m1, m2
    mova m2, m3
    punpcklwd m3, m4, m11
%else
    psrldq m6, m4, 8
    punpcklwd m3, m4, m6
    mova [esp+0x1a0], m6
    mova [esp+0x1b0], m0
    mova [esp+0x1c0], m1
    mova [esp+0x1d0], m2
    mova [esp+0x1e0], m3
%endif
    jmp .w4_loop
INIT_XMM ssse3
.w8:
    mov dword [rsp+0x90], 1
    movifprep tmp_stridem, 16
    jmp .w_start
.w16:
    mov dword [rsp+0x90], 2
    movifprep tmp_stridem, 32
    jmp .w_start
.w32:
    mov dword [rsp+0x90], 4
    movifprep tmp_stridem, 64
    jmp .w_start
.w64:
    mov dword [rsp+0x90], 8
    movifprep tmp_stridem, 128
    jmp .w_start
.w128:
    mov dword [rsp+0x90], 16
    movifprep tmp_stridem, 256
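; Widths of 8 and above share one implementation: [rsp+0x90] holds the number
; of 8-pixel column strips and, for prep, tmp_stridem the row stride of the
; intermediate buffer; .hloop sets up each strip and .vloop walks its rows.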
.w_start:
%ifidn %1, put
    movifnidn dsm, dsq
%endif
%if ARCH_X86_64
    shr t0d, 16
    movd m15, t0d
%else
 %define m8 m0
 %xdefine m14 m4
 %define m15 m3
 %if isprep
  %define ssq ssm
 %endif
    mov r4, [esp+0x1f0]
    shr r4, 16
    movd m15, r4
    mov r0, r0m
    mov myd, mym
%endif
    sub srcq, 3
    pslld m7, m8, 2 ; dx*4
    pmaddwd m8, [base+rescale_mul] ; dx*[0-3]
    pshufd m15, m15, q0000
    paddd m14, m8 ; mx+dx*[0-3]
    mova [rsp+0x100], m7
    mova [rsp+0x120], m15
    mov [rsp+0x098], srcq
    mov [rsp+0x130], r0q ; dstq / tmpq
%if ARCH_X86_64 && UNIX64
    mov hm, hd
%elif ARCH_X86_32
    mov r5, hm
    mov [esp+0x094], myd
    mov [esp+0x134], r5
%endif
    jmp .hloop
.hloop_prep:
    dec dword [rsp+0x090]
    jz .ret
%if ARCH_X86_64
    add qword [rsp+0x130], 8*(isprep+1)
    mov hd, hm
%else
    add dword [esp+0x130], 8*(isprep+1)
    mov myd, [esp+0x094]
    mov r5, [esp+0x134]
    mov r0, [esp+0x130]
%endif
    mova m7, [rsp+0x100]
    mova m14, [rsp+0x110]
%if ARCH_X86_64
    mova m10, [base+pd_0x3ff]
%endif
    mova m15, [rsp+0x120]
    pxor m9, m9
    mov srcq, [rsp+0x098]
%if ARCH_X86_64
    mov r0q, [rsp+0x130] ; dstq / tmpq
%else
    mov mym, myd
    mov hm, r5
    mov r0m, r0
    mov r3, r3m
%endif
    paddd m14, m7
.hloop:
%if ARCH_X86_64
    mova m11, [base+pq_0x40000000]
%else
 %define m11 [base+pq_0x40000000]
%endif
    psrld m2, m14, 10
    mova [rsp], m2
    pand m6, m14, m10
    psrld m6, 6
    paddd m5, m15, m6
    pcmpeqd m6, m9
    psrldq m2, m5, 8
%if ARCH_X86_64
    movd r4d, m5
    movd r6d, m2
    psrldq m5, 4
    psrldq m2, 4
    movd r7d, m5
    movd r9d, m2
    movq m0, [base+subpel_filters+r4*8]
    movq m1, [base+subpel_filters+r6*8]
    movhps m0, [base+subpel_filters+r7*8]
    movhps m1, [base+subpel_filters+r9*8]
%else
    movd r0, m5
    movd rX, m2
    psrldq m5, 4
    psrldq m2, 4
    movd r4, m5
    movd r5, m2
    movq m0, [base+subpel_filters+r0*8]
    movq m1, [base+subpel_filters+rX*8]
    movhps m0, [base+subpel_filters+r4*8]
    movhps m1, [base+subpel_filters+r5*8]
    pxor m2, m2
 %define m9 m2
%endif
    paddd m14, m7 ; mx+dx*[4-7]
    pand m5, m14, m10
    psrld m5, 6
    paddd m15, m5
    pcmpeqd m5, m9
    mova [rsp+0x110], m14
    psrldq m4, m15, 8
%if ARCH_X86_64
    movd r10d, m15
    movd r11d, m4
    psrldq m15, 4
    psrldq m4, 4
    movd r13d, m15
    movd rXd, m4
    movq m2, [base+subpel_filters+r10*8]
    movq m3, [base+subpel_filters+r11*8]
    movhps m2, [base+subpel_filters+r13*8]
    movhps m3, [base+subpel_filters+ rX*8]
    psrld m14, 10
    psrldq m4, m14, 8
    movd r10d, m14
    movd r11d, m4
    psrldq m14, 4
    psrldq m4, 4
    movd r13d, m14
    movd rXd, m4
    mov r4d, [rsp+ 0]
    mov r6d, [rsp+ 8]
    mov r7d, [rsp+ 4]
    mov r9d, [rsp+12]
    pshufd m4, m6, q1100
    pshufd m6, m6, q3322
    pshufd m14, m5, q1100
    pshufd m5, m5, q3322
    pand m7, m11, m4
    pand m8, m11, m6
    pand m15, m11, m14
    pand m11, m11, m5
    pandn m4, m0
    pandn m6, m1
    pandn m14, m2
    pandn m5, m3
    por m7, m4
    por m8, m6
    por m15, m14
    por m11, m5
    mova [rsp+0x10], m7
    mova [rsp+0x20], m8
    mova [rsp+0x30], m15
    mova [rsp+0x40], m11
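; Each MC_8TAP_SCALED_H invocation below horizontally filters two source rows
; with the per-column filters just assembled, priming the 8-row window that
; the vertical pass consumes.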
    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10, 7, 8, 15, 11 ; 0-1
    mova [rsp+0x50], m1
    mova [rsp+0x60], m2
    MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10, 7, 8, 15, 11 ; 2-3
    mova [rsp+0x70], m3
    mova [rsp+0x80], m4
    MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10, 7, 8, 15, 11 ; 4-5
    MC_8TAP_SCALED_H 0,14, 1, 2, 3, 4, 9, 10, 7, 8, 15, 11 ; 6-7
    SWAP m7, m0
    SWAP m8, m14
    mova m1, [rsp+0x50]
    mova m2, [rsp+0x60]
    mova m3, [rsp+0x70]
    mova m9, [rsp+0x80]
    mov myd, mym
    mov dyd, dym
    punpcklwd m4, m5, m6 ; 45a
    punpckhwd m5, m6 ; 45b
    punpcklwd m6, m7, m8 ; 67a
    punpckhwd m7, m8 ; 67b
    punpcklwd m0, m1, m2 ; 01a
    punpckhwd m1, m2 ; 01b
    punpcklwd m2, m3, m9 ; 23a
    punpckhwd m3, m9 ; 23b
    mova [rsp+0x50], m4
    mova [rsp+0x60], m5
    mova [rsp+0x70], m6
    mova [rsp+0x80], m7
    SWAP m14, m8
.vloop:
    and myd, 0x3ff
    mov r6d, 64 << 24
    mov r4d, myd
    shr r4d, 6
    lea r4d, [t1+r4]
    cmovnz r6q, [base+subpel_filters+r4*8]
    movq m11, r6q
    punpcklbw m11, m11
    psraw m11, 8
    pshufd m5, m11, q0000
    pshufd m7, m11, q1111
    pshufd m10, m11, q2222
    pshufd m11, m11, q3333
    pmaddwd m4, m5, m0
    pmaddwd m5, m5, m1
    pmaddwd m6, m7, m2
    pmaddwd m7, m7, m3
    paddd m4, m13
    paddd m5, m13
    paddd m4, m6
    paddd m5, m7
    pmaddwd m6, [rsp+0x50], m10
    pmaddwd m7, [rsp+0x60], m10
    pmaddwd m8, [rsp+0x70], m11
    pmaddwd m9, [rsp+0x80], m11
    paddd m4, m6
    paddd m5, m7
    paddd m4, m8
    paddd m5, m9
%else
    movd r0, m15
    movd rX, m4
    psrldq m15, 4
    psrldq m4, 4
    movd r4, m15
    movd r5, m4
    mova m14, [esp+0x110]
    movq m2, [base+subpel_filters+r0*8]
    movq m3, [base+subpel_filters+rX*8]
    movhps m2, [base+subpel_filters+r4*8]
    movhps m3, [base+subpel_filters+r5*8]
    psrld m14, 10
    mova [esp+16], m14
    mov r0, [esp+ 0]
    mov rX, [esp+ 8]
    mov r4, [esp+ 4]
    mov r5, [esp+12]
    mova [esp+0x20], m0
    mova [esp+0x30], m1
    mova [esp+0x40], m2
    mova [esp+0x50], m3
    pshufd m4, m6, q1100
    pshufd m6, m6, q3322
    pshufd m7, m5, q1100
    pshufd m5, m5, q3322
    pand m0, m11, m4
    pand m1, m11, m6
    pand m2, m11, m7
    pand m3, m11, m5
    pandn m4, [esp+0x20]
    pandn m6, [esp+0x30]
    pandn m7, [esp+0x40]
    pandn m5, [esp+0x50]
    por m0, m4
    por m1, m6
    por m2, m7
    por m3, m5
    mova [esp+0x20], m0
    mova [esp+0x30], m1
    mova [esp+0x40], m2
    mova [esp+0x50], m3
    MC_8TAP_SCALED_H 0x20, 0x140, 0 ; 0-1
    MC_8TAP_SCALED_H 0x20, 0x160 ; 2-3
    MC_8TAP_SCALED_H 0x20, 0x180 ; 4-5
    MC_8TAP_SCALED_H 0x20, 0x1a0 ; 6-7
    mova m5, [esp+0x180]
    mova m6, [esp+0x190]
    mova m7, [esp+0x1a0]
    mova m0, [esp+0x1b0]
    mov myd, mym
    punpcklwd m4, m5, m6 ; 45a
    punpckhwd m5, m6 ; 45b
    punpcklwd m6, m7, m0 ; 67a
    punpckhwd m7, m0 ; 67b
    mova [esp+0x180], m4
    mova [esp+0x190], m5
    mova [esp+0x1a0], m6
    mova [esp+0x1b0], m7
    mova m1, [esp+0x140]
    mova m2, [esp+0x150]
    mova m3, [esp+0x160]
    mova m4, [esp+0x170]
    punpcklwd m0, m1, m2 ; 01a
    punpckhwd m1, m2 ; 01b
    punpcklwd m2, m3, m4 ; 23a
    punpckhwd m3, m4 ; 23b
    mova [esp+0x140], m0
    mova [esp+0x150], m1
    mova [esp+0x160], m2
    mova [esp+0x170], m3
.vloop:
    mov r0, r0m
    mov r5, [esp+0x1f4]
    and myd, 0x3ff
    mov mym, myd
    xor r3, r3
    shr r4, 6
    lea r5, [r5+r4]
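; x86-32 has no 64-bit cmov, so the 8 filter taps are fetched as two dwords
; and merged with punpckldq below.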
    mov r4, 64 << 24
    cmovnz r4, [base+subpel_filters+r5*8+0]
    cmovnz r3, [base+subpel_filters+r5*8+4]
    movd m7, r4
    movd m6, r3
    punpckldq m7, m6
    punpcklbw m7, m7
    psraw m7, 8
    pshufd m4, m7, q0000
    pshufd m5, m7, q1111
    pmaddwd m0, m4
    pmaddwd m1, m4
    pmaddwd m2, m5
    pmaddwd m3, m5
    pshufd m6, m7, q2222
    pshufd m7, m7, q3333
    paddd m0, m2
    paddd m1, m3
    pmaddwd m2, [esp+0x180], m6
    pmaddwd m3, [esp+0x190], m6
    pmaddwd m4, [esp+0x1a0], m7
    pmaddwd m5, [esp+0x1b0], m7
    paddd m0, m2
    paddd m1, m3
    paddd m0, m13
    paddd m1, m13
    paddd m4, m0
    paddd m5, m1
%endif
    psrad m4, rndshift
    psrad m5, rndshift
    packssdw m4, m5
%ifidn %1, put
    packuswb m4, m4
    movq [dstq], m4
    add dstq, dsm
%else
    mova [tmpq], m4
    add tmpq, tmp_stridem
%endif
    dec hd
    jz .hloop_prep
%if ARCH_X86_64
    add myd, dyd
    test myd, ~0x3ff
    jz .vloop
    test myd, 0x400
    mov [rsp+0x140], myd
    mov r4d, [rsp+ 0]
    mov r6d, [rsp+ 8]
    mov r7d, [rsp+ 4]
    mov r9d, [rsp+12]
    jz .skip_line
    mova m14, [base+unpckw]
    movq m6, [srcq+r10]
    movq m7, [srcq+r11]
    movhps m6, [srcq+r13]
    movhps m7, [srcq+ rX]
    movq m4, [srcq+ r4]
    movq m5, [srcq+ r6]
    movhps m4, [srcq+ r7]
    movhps m5, [srcq+ r9]
    add srcq, ssq
    mov myd, [rsp+0x140]
    mov dyd, dym
    pshufd m9, m14, q1032
    pshufb m0, m14 ; 0a 1a
    pshufb m1, m14 ; 0b 1b
    pshufb m2, m9 ; 3a 2a
    pshufb m3, m9 ; 3b 2b
    pmaddubsw m6, [rsp+0x30]
    pmaddubsw m7, [rsp+0x40]
    pmaddubsw m4, [rsp+0x10]
    pmaddubsw m5, [rsp+0x20]
    phaddw m6, m7
    phaddw m4, m5
    phaddw m4, m6
    pmulhrsw m4, m12
    pshufb m5, [rsp+0x50], m14 ; 4a 5a
    pshufb m6, [rsp+0x60], m14 ; 4b 5b
    pshufb m7, [rsp+0x70], m9 ; 7a 6a
    pshufb m8, [rsp+0x80], m9 ; 7b 6b
    punpckhwd m0, m2 ; 12a
    punpckhwd m1, m3 ; 12b
    punpcklwd m2, m5 ; 34a
    punpcklwd m3, m6 ; 34b
    punpckhwd m5, m7 ; 56a
    punpckhwd m6, m8 ; 56b
    punpcklwd m7, m4 ; 78a
    punpckhqdq m4, m4
    punpcklwd m8, m4 ; 78b
    mova [rsp+0x50], m5
    mova [rsp+0x60], m6
    mova [rsp+0x70], m7
    mova [rsp+0x80], m8
    jmp .vloop
.skip_line:
    mova m0, [rsp+0x10]
    mova m1, [rsp+0x20]
    mova m14, [rsp+0x30]
    mova m15, [rsp+0x40]
    MC_8TAP_SCALED_H 4, 8, 5, 6, 7, 9, 10, 11, 0, 1, 14, 15
    mov myd, [rsp+0x140]
    mov dyd, dym
    mova m0, m2 ; 01a
    mova m1, m3 ; 01b
    mova m2, [rsp+0x50] ; 23a
    mova m3, [rsp+0x60] ; 23b
    mova m5, [rsp+0x70] ; 45a
    mova m6, [rsp+0x80] ; 45b
    punpcklwd m7, m4, m8 ; 67a
    punpckhwd m4, m8 ; 67b
    mova [rsp+0x50], m5
    mova [rsp+0x60], m6
    mova [rsp+0x70], m7
    mova [rsp+0x80], m4
%else
    mov r0m, r0
    mov myd, mym
    mov r3, r3m
    add myd, dym
    test myd, ~0x3ff
    mov mym, myd
    jnz .next_line
    mova m0, [esp+0x140]
    mova m1, [esp+0x150]
    mova m2, [esp+0x160]
    mova m3, [esp+0x170]
    jmp .vloop
.next_line:
    test myd, 0x400
    mov r0, [esp+ 0]
    mov rX, [esp+ 8]
    mov r4, [esp+ 4]
    mov r5, [esp+12]
    jz .skip_line
    mova m6, [base+unpckw]
    mova m0, [esp+0x140]
    mova m1, [esp+0x150]
    mova m7, [esp+0x180]
    movq m4, [srcq+r0]
    movq m5, [srcq+rX]
    movhps m4, [srcq+r4]
    movhps m5, [srcq+r5]
    pshufb m0, m6 ; 0a 1a
    pshufb m1, m6 ; 0b 1b
    pshufb m7, m6 ; 4a 5a
    mov r0, [esp+16]
    mov rX, [esp+24]
    mov r4, [esp+20]
    mov r5, [esp+28]
    movq m3, [srcq+r0]
    movq m2, [srcq+rX]
    movhps m3, [srcq+r4]
    movhps m2, [srcq+r5]
    add srcq, ssq
    pmaddubsw m4, [esp+0x20]
    pmaddubsw m5, [esp+0x30]
    pmaddubsw m3, [esp+0x40]
    pmaddubsw m2, [esp+0x50]
    phaddw m4, m5
    phaddw m3, m2
    mova m5, [esp+0x190]
    mova m2, [esp+0x160]
    phaddw m4, m3
    mova m3, [esp+0x170]
    pmulhrsw m4, m12 ; 8a 8b
    mov myd, mym
    pshufb m5, m6 ; 4b 5b
    pshufd m6, m6, q1032
    pshufb m2, m6 ; 3a 2a
    pshufb m3, m6 ; 3b 2b
    punpckhwd m0, m2 ; 12a
    punpckhwd m1, m3 ; 12b
    mova [esp+0x140], m0
    mova [esp+0x150], m1
    mova m0, [esp+0x1a0]
    mova m1, [esp+0x1b0]
    punpcklwd m2, m7 ; 34a
    punpcklwd m3, m5 ; 34b
    mova [esp+0x160], m2
    mova [esp+0x170], m3
    pshufb m0, m6 ; 7a 6a
    pshufb m1, m6 ; 7b 6b
    punpckhwd m7, m0 ; 56a
    punpckhwd m5, m1 ; 56b
    punpcklwd m0, m4
    punpckhqdq m4, m4
    punpcklwd m1, m4
    mova [esp+0x180], m7
    mova [esp+0x190], m5
    mova [esp+0x1a0], m0
    mova [esp+0x1b0], m1
    mova m0, [esp+0x140]
    mova m1, [esp+0x150]
    jmp .vloop
.skip_line:
    MC_8TAP_SCALED_H 0x20, 0x1c0, 0
    mov myd, mym
    mova m0, [esp+0x160]
    mova m1, [esp+0x170]
    mova m2, [esp+0x180]
    mova m3, [esp+0x190]
    mova [esp+0x140], m0
    mova [esp+0x150], m1
    mova m4, [esp+0x1a0]
    mova m5, [esp+0x1b0]
    mova [esp+0x160], m2
    mova [esp+0x170], m3
    mova m6, [esp+0x1c0]
    mova m7, [esp+0x1d0]
    mova [esp+0x180], m4
    mova [esp+0x190], m5
    punpcklwd m4, m6, m7
    punpckhwd m6, m7
    mova [esp+0x1a0], m4
    mova [esp+0x1b0], m6
%endif
    jmp .vloop
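; .dy1 covers a vertical step of exactly one source row per output row: the
; subpel phase never changes, so the vertical filter is constant for the whole
; column and only the sliding 8-row window is updated between output rows.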
INIT_XMM ssse3
.dy1:
    movzx wd, word [base+%1_8tap_scaled_ssse3_dy1_table+wq*2]
    add wq, base_reg
    jmp wq
%ifidn %1, put
.dy1_w2:
 %if ARCH_X86_64
    mov myd, mym
    movzx t0d, t0b
    dec srcq
    movd m15, t0d
 %else
  %define m8 m0
  %define m9 m1
  %define m14 m4
  %define m15 m3
    movzx r5, byte [esp+0x1f0]
    dec srcd
    movd m15, r5
 %endif
    punpckldq m9, m8
    SWAP m8, m9
    paddd m14, m8 ; mx+dx*[0-1]
 %if ARCH_X86_64
    mova m11, [base+pd_0x4000]
 %else
  %define m11 [base+pd_0x4000]
 %endif
    pshufd m15, m15, q0000
    pand m8, m14, m10
    psrld m8, 6
    paddd m15, m8
    movd r4d, m15
    psrldq m15, 4
 %if ARCH_X86_64
    movd r6d, m15
 %else
    movd r3d, m15
 %endif
    mova m5, [base+bdct_lb_dw]
    mova m6, [base+subpel_s_shuf2]
    movd m15, [base+subpel_filters+r4*8+2]
 %if ARCH_X86_64
    movd m7, [base+subpel_filters+r6*8+2]
 %else
    movd m7, [base+subpel_filters+r3*8+2]
 %endif
    pxor m9, m9
    pcmpeqd m8, m9
    psrld m14, 10
 %if ARCH_X86_32
    mov r3, r3m
    pshufb m14, m5
    paddb m14, m6
    mova [esp+0x00], m14
  %define m14 [esp+0x00]
    SWAP m5, m0
    SWAP m6, m3
  %define m8 m5
  %define m15 m6
 %endif
    movq m0, [srcq+ssq*0]
    movq m2, [srcq+ssq*2]
    movhps m0, [srcq+ssq*1]
    movhps m2, [srcq+ss3q ]
    lea srcq, [srcq+ssq*4]
 %if ARCH_X86_64
    shr myd, 6
    mov r4d, 64 << 24
    lea myd, [t1+myq]
    cmovnz r4q, [base+subpel_filters+myq*8]
    pshufb m14, m5
    paddb m14, m6
    movq m10, r4
 %else
    mov myd, mym
    mov r5, [esp+0x1f4]
    xor r3, r3
    shr myd, 6
    lea r5, [r5+myd]
    mov r4, 64 << 24
    cmovnz r4, [base+subpel_filters+r5*8+0]
    cmovnz r3, [base+subpel_filters+r5*8+4]
  %define m10 m4
    movd m10, r4
    movd m3, r3
    mov r3, r3m
    punpckldq m10, m3
 %endif
    movq m1, [srcq+ssq*0]
    movq m3, [srcq+ssq*2]
    movhps m1, [srcq+ssq*1]
    add srcq, ss3q
    punpcklbw m10, m10
    psraw m10, 8
    punpckldq m15, m7
    punpcklqdq m15, m15
 %if ARCH_X86_64
    pand m11, m8
 %else
    pand m7, m11, m8
  %define m11 m7
 %endif
    pandn m8, m15
    SWAP m15, m8
    por m15, m11
 %if ARCH_X86_64
    pshufd m8, m10, q0000
    pshufd m9, m10, q1111
    pshufd m11, m10, q3333
    pshufd m10, m10, q2222
 %else
    mova [esp+0x10], m15
  %define m15 [esp+0x10]
    mov r0, r0m
    pshufd m5, m4, q0000
    pshufd m6, m4, q1111
    pshufd m7, m4, q2222
    pshufd m4, m4, q3333
  %define m8 [esp+0x20]
  %define m9 [esp+0x30]
  %define m10 [esp+0x40]
  %define m11 [esp+0x50]
    mova m8, m5
    mova m9, m6
    mova m10, m7
    mova m11, m4
 %endif
    pshufb m0, m14
    pshufb m2, m14
    pshufb m1, m14
    pshufb m3, m14
    pmaddubsw m0, m15
    pmaddubsw m2, m15
    pmaddubsw m1, m15
    pmaddubsw m3, m15
    phaddw m0, m2
    phaddw m1, m3
    pmulhrsw m0, m12
    pmulhrsw m1, m12
    palignr m2, m1, m0, 4
    pshufd m4, m1, q2121
    punpcklwd m3, m0, m2 ; 01 12
    punpckhwd m0, m2 ; 23 34
    punpcklwd m2, m1, m4 ; 45 56
.dy1_w2_loop:
    movq m1, [srcq+ssq*0]
    movhps m1, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    pmaddwd m5, m3, m8
    pmaddwd m6, m0, m9
    pmaddwd m7, m2, m10
    mova m3, m0
    mova m0, m2
    paddd m5, m13
    paddd m6, m7
    pshufb m1, m14
    pmaddubsw m1, m15
    phaddw m1, m1
    pmulhrsw m1, m12
    palignr m7, m1, m4, 12
    punpcklwd m2, m7, m1 ; 67 78
    pmaddwd m7, m2, m11
    mova m4, m1
    paddd m5, m6
    paddd m5, m7
    psrad m5, rndshift
    packssdw m5, m5
    packuswb m5, m5
    movd r4d, m5
    mov [dstq+dsq*0], r4w
    shr r4d, 16
    mov [dstq+dsq*1], r4w
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .dy1_w2_loop
    RET
%endif
INIT_XMM ssse3
.dy1_w4:
%if ARCH_X86_64
    mov myd, mym
    movzx t0d, t0b
    dec srcq
    movd m15, t0d
%else
 %define m10 [base+pd_0x3ff]
 %define m11 [base+pd_0x4000]
 %define m8 m0
 %xdefine m14 m4
 %define m15 m3
 %if isprep
  %define ssq r3
 %endif
    movzx r4, byte [esp+0x1f0]
    dec srcq
    movd m15, r4
%endif
    pmaddwd m8, [base+rescale_mul]
%if ARCH_X86_64
    mova m11, [base+pd_0x4000]
%endif
    pshufd m15, m15, q0000
    paddd m14, m8 ; mx+dx*[0-3]
    pand m8, m14, m10
    psrld m8, 6
    paddd m15, m8
    psrldq m7, m15, 8
%if ARCH_X86_64
    movd r4d, m15
    movd r11d, m7
    psrldq m15, 4
    psrldq m7, 4
    movd r6d, m15
    movd r13d, m7
    movd m15, [base+subpel_filters+ r4*8+2]
    movd m2, [base+subpel_filters+r11*8+2]
    movd m3, [base+subpel_filters+ r6*8+2]
    movd m4, [base+subpel_filters+r13*8+2]
    shr myd, 6
    mov r4d, 64 << 24
    lea myd, [t1+myq]
    cmovnz r4q, [base+subpel_filters+myq*8]
%else
    movd r1, m15
    movd r3, m7
    psrldq m15, 4
    psrldq m7, 4
    movd r4, m15
    movd r5, m7
 %define m15 m5
    SWAP m4, m7
    movd m15, [base+subpel_filters+r1*8+2]
    movd m2, [base+subpel_filters+r3*8+2]
    movd m3, [base+subpel_filters+r4*8+2]
    movd m4, [base+subpel_filters+r5*8+2]
    mov myd, mym
    mov rX, [esp+0x1f4]
    xor r5, r5
    shr myd, 6
    lea rX, [rX+myd]
    mov r4, 64 << 24
    cmovnz r4, [base+subpel_filters+rX*8+0]
    cmovnz r5, [base+subpel_filters+rX*8+4]
    mov r3, r3m
 %if isprep
    lea ss3q, [ssq*3]
 %endif
%endif
    punpckldq m15, m3
    punpckldq m2, m4
    punpcklqdq m15, m2
    movq m6, [base+subpel_s_shuf2]
%if ARCH_X86_64
    pcmpeqd m8, m9
    psrld m14, 10
    pshufb m14, [base+bdct_lb_dw]
    movu m0, [srcq+ssq*0]
    movu m1, [srcq+ssq*1]
    movu m2, [srcq+ssq*2]
    movu m3, [srcq+ss3q ]
    lea srcq, [srcq+ssq*4]
    punpcklqdq m6, m6
    movu m4, [srcq+ssq*0]
    movu m5, [srcq+ssq*1]
    movu m7, [srcq+ssq*2]
    add srcq, ss3q
    pand m11, m8
    pandn m8, m15
    SWAP m15, m8
    por m15, m11
    paddb m14, m6
    movq m10, r4q
    punpcklbw m10, m10
    psraw m10, 8
    pshufb m0, m14
    pshufb m1, m14
    pshufb m2, m14
    pshufb m3, m14
    pshufb m4, m14
    pshufb m5, m14
    pshufb m7, m14
    pmaddubsw m0, m15
    pmaddubsw m1, m15
    pmaddubsw m2, m15
    pmaddubsw m3, m15
    pmaddubsw m4, m15
    pmaddubsw m5, m15
    pmaddubsw m7, m15
    phaddw m0, m1
    phaddw m2, m3
    phaddw m4, m5
    phaddw m6, m7, m7
    pmulhrsw m0, m12 ; 0 1
    pmulhrsw m2, m12 ; 2 3
    pmulhrsw m4, m12 ; 4 5
    pmulhrsw m6, m12 ; 6 _
    shufps m1, m0, m2, q1032 ; 1 2
    shufps m3, m2, m4, q1032 ; 3 4
    shufps m5, m4, m6, q1032 ; 5 6
    punpcklwd m7, m0, m1 ; 01
    punpckhwd m0, m1 ; 12
    punpcklwd m8, m2, m3 ; 23
    punpckhwd m2, m3 ; 34
    punpcklwd m9, m4, m5 ; 45
    punpckhwd m4, m5 ; 56
%else
    pxor m3, m3
    pcmpeqd m8, m3
    psrld m14, 10
    pshufb m14, [base+bdct_lb_dw]
    movu m1, [srcq+ssq*0]
    movu m2, [srcq+ssq*1]
    movu m3, [srcq+ssq*2]
    add srcq, ss3q
    punpcklqdq m6, m6
    SWAP m4, m7
    pand m7, m11, m8
    pandn m8, m15
    SWAP m5, m0
    por m15, m7
    paddb m14, m6
    movu m0, [srcq+ssq*0]
    movu m7, [srcq+ssq*1]
    movu m6, [srcq+ssq*2]
    pshufb m1, m14
    pshufb m2, m14
    pshufb m3, m14
    pshufb m0, m14
    pshufb m7, m14
    pshufb m6, m14
    pmaddubsw m1, m15
    pmaddubsw m2, m15
    pmaddubsw m3, m15
    mova [esp+0x00], m14
    mova [esp+0x10], m15
    pmaddubsw m0, m15
    pmaddubsw m7, m15
    pmaddubsw m6, m15
    phaddw m1, m2
    movu m2, [srcq+ss3q ]
    lea srcq, [srcq+ssq*4]
    mov r0, r0m
    phaddw m3, m0
    pshufb m2, m14
    pmaddubsw m2, m15
 %define m14 [esp+0x00]
 %define m15 [esp+0x10]
    phaddw m7, m6
    phaddw m2, m2
    movd m6, r4
    movd m0, r5
    punpckldq m6, m0
    punpcklbw m6, m6
    psraw m6, 8
    mova [esp+0x20], m6
    pmulhrsw m1, m12 ; 0 1
    pmulhrsw m3, m12 ; 2 3
    pmulhrsw m7, m12 ; 4 5
    pmulhrsw m2, m12 ; 6 _
    shufps m0, m1, m3, q1032 ; 1 2
    shufps m4, m3, m7, q1032 ; 3 4
    shufps m5, m7, m2, q1032 ; 5 6
    punpcklwd m6, m1, m0 ; 01
    punpckhwd m1, m0 ; 12
    mova [esp+0x30], m1
    punpcklwd m1, m3, m4 ; 23
    punpckhwd m3, m4 ; 34
    mova [esp+0x40], m3
    punpcklwd m3, m7, m5 ; 45
    punpckhwd m7, m5 ; 56
    mova [esp+0x50], m7
    mova [esp+0x60], m2
    mova m0, [esp+0x20]
 %xdefine m8 m1
 %xdefine m9 m3
 %xdefine m10 m0
    SWAP m7, m6
    SWAP m1, m4
    SWAP m3, m2
%endif
    pshufd m1, m10, q0000
    pshufd m3, m10, q1111
    pshufd m5, m10, q2222
    pshufd m10, m10, q3333
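; Broadcast the four coefficient pairs of the vertical filter; with dy == 1
; the subpel phase never changes, so this happens once, outside the loop.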
%if ARCH_X86_64
    mova [rsp+0x00], m8
    mova [rsp+0x10], m2
    mova [rsp+0x20], m9
    mova [rsp+0x30], m4
%else
    mova [esp+0x70], m8
    mova [esp+0x80], m9
    mova [esp+0x90], m1
    mova [esp+0xa0], m3
    mova [esp+0xb0], m5
    mova [esp+0xc0], m10
 %ifidn %1, put
    mov dsd, dsm
 %endif
 %define m11 m6
%endif
.dy1_w4_loop:
%if ARCH_X86_64
    movu m11, [srcq+ssq*0]
    pmaddwd m7, m1
    pmaddwd m8, m3
    pmaddwd m0, m1
    pmaddwd m2, m3
    pmaddwd m9, m5
    pmaddwd m4, m5
    paddd m7, m8
    paddd m0, m2
    movu m8, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    pshufb m11, m14
    pmaddubsw m11, m15
    paddd m7, m13
    paddd m0, m13
    paddd m7, m9
    paddd m0, m4
    pshufb m8, m14
    pmaddubsw m8, m15
    phaddw m11, m8
    mova m8, [rsp+0x20]
    pmulhrsw m11, m12
    punpcklwd m9, m6, m11 ; 67
    psrldq m6, m11, 8
    punpcklwd m4, m11, m6 ; 78
    pmaddwd m2, m9, m10
    pmaddwd m11, m4, m10
    paddd m7, m2
    mova m2, [rsp+0x30]
    paddd m0, m11
%else
    SWAP m7, m6
    SWAP m1, m4
    SWAP m3, m2
    movu m5, [srcq+ssq*0]
    mova m0, [esp+0x30]
    mova m2, [esp+0x40]
    mova m4, [esp+0x50]
    pmaddwd m6, [esp+0x90]
    pmaddwd m1, [esp+0xa0]
    pmaddwd m0, [esp+0x90]
    pmaddwd m2, [esp+0xa0]
    pmaddwd m3, [esp+0xb0]
    pmaddwd m4, [esp+0xb0]
    paddd m6, m1
    paddd m0, m2
    movu m7, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    pshufb m5, m14
    pmaddubsw m5, m15
    paddd m6, m13
    paddd m0, m13
    paddd m6, m3
    paddd m0, m4
    pshufb m7, m14
    pmaddubsw m7, m15
    phaddw m5, m7
    mova m7, [rsp+0x80]
    pmulhrsw m5, m12
    punpcklwd m3, [esp+0x60], m5 ; 67
    psrldq m1, m5, 8
    punpcklwd m4, m5, m1 ; 78
    pmaddwd m2, m3, [esp+0xc0]
    pmaddwd m5, m4, [esp+0xc0]
    mova [esp+0x60], m1
    paddd m6, m2
    mova m2, [esp+0x50]
    paddd m0, m5
    SWAP m7, m6
%endif
    psrad m7, rndshift
    psrad m0, rndshift
    packssdw m7, m0
%if ARCH_X86_64
    mova m0, [rsp+0x10]
%else
    mova m0, [esp+0x40]
 %define m11 m5
%endif
%ifidn %1, put
    packuswb m7, m7
    psrldq m11, m7, 4
    movd [dstq+dsq*0], m7
    movd [dstq+dsq*1], m11
    lea dstq, [dstq+dsq*2]
%else
    mova [tmpq], m7
    add tmpq, 16
%endif
    sub hd, 2
    jz .ret
%if ARCH_X86_64
    mova m7, [rsp+0x00]
    mova [rsp+0x00], m8
    mova [rsp+0x10], m2
    mova [rsp+0x20], m9
    mova [rsp+0x30], m4
%else
    mova m7, [esp+0x70] ; 01
    mova m1, [esp+0x80] ; 23
    mova m2, [esp+0x50] ; 34
    mova [esp+0x30], m0
    mova [esp+0x70], m1
    mova [esp+0x40], m2
    mova [esp+0x80], m3
    mova [esp+0x50], m4
%endif
    jmp .dy1_w4_loop
INIT_XMM ssse3
.dy1_w8:
    mov dword [rsp+0x90], 1
    movifprep tmp_stridem, 16
    jmp .dy1_w_start
.dy1_w16:
    mov dword [rsp+0x90], 2
    movifprep tmp_stridem, 32
    jmp .dy1_w_start
.dy1_w32:
    mov dword [rsp+0x90], 4
    movifprep tmp_stridem, 64
    jmp .dy1_w_start
.dy1_w64:
    mov dword [rsp+0x90], 8
    movifprep tmp_stridem, 128
    jmp .dy1_w_start
.dy1_w128:
    mov dword [rsp+0x90], 16
    movifprep tmp_stridem, 256
.dy1_w_start:
    mov myd, mym
%ifidn %1, put
    movifnidn dsm, dsq
%endif
%if ARCH_X86_64
    shr t0d, 16
    sub srcq, 3
    shr myd, 6
    mov r4d, 64 << 24
    lea myd, [t1+myq]
    cmovnz r4q, [base+subpel_filters+myq*8]
    movd m15, t0d
%else
 %define m8 m0
 %define m9 m1
 %xdefine m14 m4
 %xdefine m15 m3
 %if isprep
  %define ssq ssm
 %endif
    mov r5, [esp+0x1f0]
    mov r3, [esp+0x1f4]
    shr r5, 16
    sub srcq, 3
    movd m15, r5
    xor r5, r5
    shr myd, 6
    lea r3, [r3+myd]
    mov r4, 64 << 24
    cmovnz r4, [base+subpel_filters+r3*8+0]
    cmovnz r5, [base+subpel_filters+r3*8+4]
    mov r0, r0m
    mov r3, r3m
%endif
    pslld m7, m8, 2 ; dx*4
    pmaddwd m8, [base+rescale_mul] ; dx*[0-3]
    pshufd m15, m15, q0000
    paddd m14, m8 ; mx+dx*[0-3]
%if ARCH_X86_64
    movq m3, r4q
    punpcklbw m3, m3
    psraw m3, 8
%else
    movd m5, r4
    movd m6, r5
    punpckldq m5, m6
    punpcklbw m5, m5
    psraw m5, 8
    SWAP m3, m5
%endif
    mova [rsp+0x100], m7
    mova [rsp+0x120], m15
    mov [rsp+0x098], srcq
    mov [rsp+0x130], r0q ; dstq / tmpq
    pshufd m0, m3, q0000
    pshufd m1, m3, q1111
    pshufd m2, m3, q2222
    pshufd m3, m3, q3333
    mova [rsp+0x140], m0
    mova [rsp+0x150], m1
    mova [rsp+0x160], m2
    mova [rsp+0x170], m3
%if ARCH_X86_64 && UNIX64
    mov hm, hd
%elif ARCH_X86_32
    SWAP m5, m3
    mov r5, hm
    mov [esp+0x134], r5
%endif
    jmp .dy1_hloop
.dy1_hloop_prep:
    dec dword [rsp+0x090]
    jz .ret
%if ARCH_X86_64
    add qword [rsp+0x130], 8*(isprep+1)
    mov hd, hm
%else
    add dword [rsp+0x130], 8*(isprep+1)
    mov r5, [esp+0x134]
    mov r0, [esp+0x130]
%endif
    mova m7, [rsp+0x100]
    mova m14, [rsp+0x110]
%if ARCH_X86_64
    mova m10, [base+pd_0x3ff]
%else
 %define m10 [base+pd_0x3ff]
%endif
    mova m15, [rsp+0x120]
    mov srcq, [rsp+0x098]
%if ARCH_X86_64
    mov r0q, [rsp+0x130] ; dstq / tmpq
%else
    mov hm, r5
    mov r0m, r0
    mov r3, r3m
%endif
    paddd m14, m7
.dy1_hloop:
    pxor m9, m9
%if ARCH_X86_64
    mova m11, [base+pq_0x40000000]
%else
 %define m11 [base+pq_0x40000000]
%endif
    psrld m2, m14, 10
    mova [rsp], m2
    pand m6, m14, m10
    psrld m6, 6
    paddd m5, m15, m6
    pcmpeqd m6, m9
    psrldq m2, m5, 8
%if ARCH_X86_64
    movd r4d, m5
    movd r6d, m2
    psrldq m5, 4
    psrldq m2, 4
    movd r7d, m5
    movd r9d, m2
    movq m0, [base+subpel_filters+r4*8]
    movq m1, [base+subpel_filters+r6*8]
    movhps m0, [base+subpel_filters+r7*8]
    movhps m1, [base+subpel_filters+r9*8]
%else
    movd r0, m5
    movd rX, m2
    psrldq m5, 4
    psrldq m2, 4
    movd r4, m5
    movd r5, m2
    movq m0, [base+subpel_filters+r0*8]
    movq m1, [base+subpel_filters+rX*8]
    movhps m0, [base+subpel_filters+r4*8]
    movhps m1, [base+subpel_filters+r5*8]
    pxor m2, m2
 %define m9 m2
%endif
    paddd m14, m7 ; mx+dx*[4-7]
    pand m5, m14, m10
    psrld m5, 6
    paddd m15, m5
    pcmpeqd m5, m9
    mova [rsp+0x110], m14
    psrldq m4, m15, 8
%if ARCH_X86_64
    movd r10d, m15
    movd r11d, m4
    psrldq m15, 4
    psrldq m4, 4
    movd r13d, m15
    movd rXd, m4
    movq m2, [base+subpel_filters+r10*8]
    movq m3, [base+subpel_filters+r11*8]
    movhps m2, [base+subpel_filters+r13*8]
    movhps m3, [base+subpel_filters+ rX*8]
    psrld m14, 10
    psrldq m4, m14, 8
    movd r10d, m14
    movd r11d, m4
    psrldq m14, 4
    psrldq m4, 4
    movd r13d, m14
    movd rXd, m4
    mov r4d, [rsp+ 0]
    mov r6d, [rsp+ 8]
    mov r7d, [rsp+ 4]
    mov r9d, [rsp+12]
    pshufd m4, m6, q1100
    pshufd m6, m6, q3322
    pshufd m7, m5, q1100
    pshufd m5, m5, q3322
    pand m8, m11, m4
    pand m9, m11, m6
    pand m15, m11, m7
    pand m11, m11, m5
    pandn m4, m0
    pandn m6, m1
    pandn m7, m2
    pandn m5, m3
    por m8, m4
    por m9, m6
    por m15, m7
    por m11, m5
    mova [rsp+0x10], m8
    mova [rsp+0x20], m9
    mova [rsp+0x30], m15
    mova [rsp+0x40], m11
    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 10, 8, 9, 15, 11 ; 0-1
    mova [rsp+0x50], m1
    mova [rsp+0x60], m2
    MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 7, 10, 8, 9, 15, 11 ; 2-3
    mova [rsp+0x70], m3
    mova [rsp+0x80], m4
    MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 4-5
    MC_8TAP_SCALED_H 0,14, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 6-7
    SWAP m7, m0
    SWAP m8, m14
    mova m1, [rsp+0x50]
    mova m2, [rsp+0x60]
    mova m3, [rsp+0x70]
    mova m15, [rsp+0x80]
    punpcklwd m4, m5, m6 ; 45a
    punpckhwd m5, m6 ; 45b
    punpcklwd m6, m7, m8 ; 67a
    punpckhwd m7, m8 ; 67b
    SWAP m14, m8
    mova m8, [rsp+0x140]
    mova m9, [rsp+0x150]
    mova m10, [rsp+0x160]
    mova m11, [rsp+0x170]
    punpcklwd m0, m1, m2 ; 01a
    punpckhwd m1, m2 ; 01b
    punpcklwd m2, m3, m15 ; 23a
    punpckhwd m3, m15 ; 23b
    mova [rsp+0x50], m4
    mova [rsp+0x60], m5
    mova [rsp+0x70], m6
    mova [rsp+0x80], m7
    mova m14, [base+unpckw]
%else
    movd r0, m15
    movd rX, m4
    psrldq m15, 4
    psrldq m4, 4
    movd r4, m15
    movd r5, m4
    mova m14, [esp+0x110]
    movq m2, [base+subpel_filters+r0*8]
    movq m3, [base+subpel_filters+rX*8]
    movhps m2, [base+subpel_filters+r4*8]
    movhps m3, [base+subpel_filters+r5*8]
    psrld m14, 10
    mova [esp+16], m14
    mov r0, [esp+ 0]
    mov rX, [esp+ 8]
    mov r4, [esp+ 4]
    mov r5, [esp+12]
    mova [esp+0x20], m0
    mova [esp+0x30], m1
    mova [esp+0x40], m2
    mova [esp+0x50], m3
    pshufd m4, m6, q1100
    pshufd m6, m6, q3322
    pshufd m7, m5, q1100
    pshufd m5, m5, q3322
    pand m0, m11, m4
    pand m1, m11, m6
    pand m2, m11, m7
    pand m3, m11, m5
    pandn m4, [esp+0x20]
    pandn m6, [esp+0x30]
    pandn m7, [esp+0x40]
    pandn m5, [esp+0x50]
    por m0, m4
    por m1, m6
    por m2, m7
    por m3, m5
    mova [esp+0x20], m0
    mova [esp+0x30], m1
    mova [esp+0x40], m2
    mova [esp+0x50], m3
    MC_8TAP_SCALED_H 0x20, 0x60, 0 ; 0-1
    MC_8TAP_SCALED_H 0x20, 0x180 ; 2-3
    MC_8TAP_SCALED_H 0x20, 0x1a0 ; 4-5
    MC_8TAP_SCALED_H 0x20, 0x1c0 ; 6-7
    mova m5, [esp+0x1a0]
    mova m6, [esp+0x1b0]
    mova m7, [esp+0x1c0]
    mova m0, [esp+0x1d0]
    punpcklwd m4, m5, m6 ; 45a
    punpckhwd m5, m6 ; 45b
    punpcklwd m6, m7, m0 ; 67a
    punpckhwd m7, m0 ; 67b
    mova [esp+0x1a0], m4
    mova [esp+0x1b0], m5
    mova [esp+0x1c0], m6
    mova [esp+0x1d0], m7
    mova m1, [esp+0x060]
    mova m2, [esp+0x070]
    mova m3, [esp+0x180]
    mova m4, [esp+0x190]
    punpcklwd m0, m1, m2 ; 01a
    punpckhwd m1, m2 ; 01b
    punpcklwd m2, m3, m4 ; 23a
    punpckhwd m3, m4 ; 23b
    mova [esp+0x060], m0
    mova [esp+0x070], m1
    mova [esp+0x180], m2
    mova [esp+0x190], m3
 %define m8 [esp+0x140]
 %define m9 [esp+0x150]
 %define m10 [esp+0x160]
 %define m11 [esp+0x170]
%endif
.dy1_vloop:
%if ARCH_X86_32
    mov r0, r0m
%endif
    pmaddwd m4, m0, m8
    pmaddwd m5, m1, m8
    pmaddwd m6, m2, m9
    pmaddwd m7, m3, m9
    paddd m4, m13
    paddd m5, m13
    paddd m4, m6
    paddd m5, m7
%if ARCH_X86_64
    pmaddwd m6, [rsp+0x50], m10
    pmaddwd m7, [rsp+0x60], m10
%else
    pmaddwd m6, [rsp+0x1a0], m10
    pmaddwd m7, [rsp+0x1b0], m10
%endif
    paddd m4, m6
    paddd m5, m7
%if ARCH_X86_64
    pmaddwd m6, [rsp+0x70], m11
    pmaddwd m7, [rsp+0x80], m11
%else
    pmaddwd m6, [rsp+0x1c0], m11
    pmaddwd m7, [rsp+0x1d0], m11
%endif
    paddd m4, m6
    paddd m5, m7
    psrad m4, rndshift
    psrad m5, rndshift
    packssdw m4, m5
%ifidn %1, put
    packuswb m4, m4
    movq [dstq], m4
    add dstq, dsm
%else
    mova [tmpq], m4
    add tmpq, tmp_stridem
%endif
%if ARCH_X86_32
    mov r0m, r0
%endif
    dec hd
    jz .dy1_hloop_prep
%if ARCH_X86_64
    movq m4, [srcq+ r4]
    movq m5, [srcq+ r6]
    movhps m4, [srcq+ r7]
    movhps m5, [srcq+ r9]
    movq m6, [srcq+r10]
    movq m7, [srcq+r11]
    movhps m6, [srcq+r13]
    movhps m7, [srcq+ rX]
    add srcq, ssq
    pshufd m15, m14, q1032
    pshufb m0, m14 ; 0a 1a
    pshufb m1, m14 ; 0b 1b
    pshufb m2, m15 ; 3a 2a
    pshufb m3, m15 ; 3b 2b
    pmaddubsw m4, [rsp+0x10]
    pmaddubsw m5, [rsp+0x20]
    pmaddubsw m6, [rsp+0x30]
    pmaddubsw m7, [rsp+0x40]
    phaddw m4, m5
    phaddw m6, m7
    phaddw m4, m6
    pmulhrsw m4, m12
    pshufb m5, [rsp+0x70], m15 ; 7a 6a
    pshufb m7, [rsp+0x80], m15 ; 7b 6b
    pshufb m6, [rsp+0x50], m14 ; 4a 5a
    pshufb m15, [rsp+0x60], m14 ; 4b 5b
    punpckhwd m0, m2 ; 12a
    punpckhwd m1, m3 ; 12b
    punpcklwd m2, m6 ; 34a
    punpcklwd m3, m15 ; 34b
    punpckhwd m6, m5 ; 56a
    punpckhwd m15, m7 ; 56b
    punpcklwd m5, m4 ; 78a
    psrldq m4, 8
    punpcklwd m7, m4 ; 78b
    mova [rsp+0x50], m6
    mova [rsp+0x60], m15
    mova [rsp+0x70], m5
    mova [rsp+0x80], m7
%else
    mov r0, [esp+ 0]
    mov rX, [esp+ 8]
    mov r4, [esp+ 4]
    mov r5, [esp+12]
    mova m6, [base+unpckw]
    mova m0, [esp+0x060]
    mova m1, [esp+0x070]
    mova m7, [esp+0x1a0]
    movq m4, [srcq+r0]
    movq m5, [srcq+rX]
    movhps m4, [srcq+r4]
    movhps m5, [srcq+r5]
    pshufb m0, m6 ; 0a 1a
    pshufb m1, m6 ; 0b 1b
    pshufb m7, m6 ; 4a 5a
    mov r0, [esp+16]
    mov rX, [esp+24]
    mov r4, [esp+20]
    mov r5, [esp+28]
    movq m3, [srcq+r0]
    movq m2, [srcq+rX]
    movhps m3, [srcq+r4]
    movhps m2, [srcq+r5]
    add srcq, ssq
    pmaddubsw m4, [esp+0x20]
    pmaddubsw m5, [esp+0x30]
    pmaddubsw m3, [esp+0x40]
    pmaddubsw m2, [esp+0x50]
    phaddw m4, m5
    phaddw m3, m2
    mova m5, [esp+0x1b0]
    mova m2, [esp+0x180]
    phaddw m4, m3
    mova m3, [esp+0x190]
    pmulhrsw m4, m12 ; 8a 8b
    pshufb m5, m6 ; 4b 5b
    pshufd m6, m6, q1032
    pshufb m2, m6 ; 3a 2a
    pshufb m3, m6 ; 3b 2b
    punpckhwd m0, m2 ; 12a
    punpckhwd m1, m3 ; 12b
    mova [esp+0x60], m0
    mova [esp+0x70], m1
    mova m0, [esp+0x1c0]
    mova m1, [esp+0x1d0]
    punpcklwd m2, m7 ; 34a
    punpcklwd m3, m5 ; 34b
    mova [esp+0x180], m2
    mova [esp+0x190], m3
    pshufb m0, m6 ; 7a 6a
    pshufb m1, m6 ; 7b 6b
    punpckhwd m7, m0 ; 56a
    punpckhwd m5, m1 ; 56b
    punpcklwd m0, m4
    punpckhqdq m4, m4
    punpcklwd m1, m4
    mova [esp+0x1a0], m7
    mova [esp+0x1b0], m5
    mova [esp+0x1c0], m0
    mova [esp+0x1d0], m1
    mova m0, [esp+0x60]
    mova m1, [esp+0x70]
%endif
    jmp .dy1_vloop
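; .dy2 covers a vertical step of two source rows per output row: every other
; line is skipped, so each new output row feeds two fresh source rows through
; the horizontal filter while the subpel phase again stays constant.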
INIT_XMM ssse3
.dy2:
    movzx wd, word [base+%1_8tap_scaled_ssse3_dy2_table+wq*2]
    add wq, base_reg
    jmp wq
%ifidn %1, put
.dy2_w2:
 %if ARCH_X86_64
    mov myd, mym
    movzx t0d, t0b
    dec srcq
    movd m15, t0d
 %else
  %define m10 [base+pd_0x3ff]
  %define m11 [base+pd_0x4000]
  %define m8 m0
  %define m9 m1
  %define m14 m4
  %define m15 m3
    movzx r5, byte [esp+0x1f0]
    dec srcd
    movd m15, r5
 %endif
    punpckldq m9, m8
    SWAP m8, m9
    paddd m14, m8 ; mx+dx*[0-1]
 %if ARCH_X86_64
    mova m11, [base+pd_0x4000]
 %endif
    pshufd m15, m15, q0000
    pand m8, m14, m10
    psrld m8, 6
    paddd m15, m8
    movd r4d, m15
    psrldq m15, 4
 %if ARCH_X86_64
    movd r6d, m15
 %else
    movd r3d, m15
 %endif
    mova m5, [base+bdct_lb_dw]
    mova m6, [base+subpel_s_shuf2]
    movd m15, [base+subpel_filters+r4*8+2]
 %if ARCH_X86_64
    movd m7, [base+subpel_filters+r6*8+2]
 %else
    movd m7, [base+subpel_filters+r3*8+2]
 %endif
    pxor m9, m9
    pcmpeqd m8, m9
    psrld m14, 10
 %if ARCH_X86_32
    mov r3, r3m
    pshufb m14, m5
    paddb m14, m6
    mova [esp+0x00], m14
  %define m14 [esp+0x00]
    SWAP m5, m0
    SWAP m6, m3
  %define m8 m5
  %define m15 m6
 %endif
    movq m0, [srcq+ssq*0]
    movq m1, [srcq+ssq*1]
    movhps m0, [srcq+ssq*2]
    movhps m1, [srcq+ss3q ]
    lea srcq, [srcq+ssq*4]
 %if ARCH_X86_64
    shr myd, 6
    mov r4d, 64 << 24
    lea myd, [t1+myq]
    cmovnz r4q, [base+subpel_filters+myq*8]
    pshufb m14, m5
    paddb m14, m6
    movq m10, r4q
 %else
    mov myd, mym
    mov r3, [esp+0x1f4]
    xor r5, r5
    shr myd, 6
    lea r3, [r3+myd]
    mov r4, 64 << 24
    cmovnz r4, [base+subpel_filters+r3*8+0]
    cmovnz r5, [base+subpel_filters+r3*8+4]
    mov r3, r3m
  %define m10 m4
    movd m10, r4
    movd m3, r5
    punpckldq m10, m3
 %endif
    movq m3, [srcq+ssq*0]
    movhps m3, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    punpcklbw m10, m10
    psraw m10, 8
    punpckldq m15, m7
    punpcklqdq m15, m15
 %if ARCH_X86_64
    pand m11, m8
 %else
    pand m7, m11, m8
  %define m11 m7
 %endif
    pandn m8, m15
    SWAP m15, m8
    por m15, m11
 %if ARCH_X86_64
    pshufd m8, m10, q0000
    pshufd m9, m10, q1111
    pshufd m11, m10, q3333
    pshufd m10, m10, q2222
 %else
    mova [esp+0x10], m15
  %define m15 [esp+0x10]
    mov r5, r0m
  %define dstq r5
    mov dsd, dsm
    pshufd m5, m4, q0000
    pshufd m6, m4, q1111
    pshufd m7, m4, q2222
    pshufd m4, m4, q3333
  %define m8 [esp+0x20]
  %define m9 [esp+0x30]
  %define m10 [esp+0x40]
  %define m11 [esp+0x50]
    mova m8, m5
    mova m9, m6
    mova m10, m7
    mova m11, m4
 %endif
    pshufb m0, m14
    pshufb m1, m14
    pshufb m3, m14
    pmaddubsw m0, m15
    pmaddubsw m1, m15
    pmaddubsw m3, m15
    pslldq m2, m3, 8
    phaddw m0, m2
    phaddw m1, m3
    pmulhrsw m0, m12 ; 0 2 _ 4
    pmulhrsw m1, m12 ; 1 3 _ 5
    pshufd m2, m0, q3110 ; 0 2 2 4
    pshufd m1, m1, q3110 ; 1 3 3 5
    punpcklwd m3, m2, m1 ; 01 23
    punpckhwd m2, m1 ; 23 45
.dy2_w2_loop:
    movq m6, [srcq+ssq*0]
    movq m7, [srcq+ssq*1]
    movhps m6, [srcq+ssq*2]
    movhps m7, [srcq+ss3q ]
    lea srcq, [srcq+ssq*4]
    pmaddwd m4, m3, m8
    pmaddwd m5, m2, m9
    pshufb m6, m14
    pshufb m7, m14
    pmaddubsw m6, m15
    pmaddubsw m7, m15
    phaddw m6, m7
    pmulhrsw m6, m12
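; m6 now holds four freshly filtered rows; below they are split and re-paired
; with the previous tail into the 45/67 and 67/89 tap windows.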
    psrldq m7, m6, 8
    palignr m6, m0, 8
    palignr m7, m1, 8
    mova m0, m6
    mova m1, m7
    pshufd m6, m6, q3221
    pshufd m7, m7, q3221
    punpcklwd m3, m6, m7 ; 45 67
    punpckhwd m2, m6, m7 ; 67 89
    pmaddwd m6, m3, m10
    pmaddwd m7, m2, m11
    paddd m4, m5
    paddd m4, m13
    paddd m6, m7
    paddd m4, m6
    psrad m4, rndshift
    packssdw m4, m4
    packuswb m4, m4
    movd r4d, m4
    mov [dstq+dsq*0], r4w
    shr r4d, 16
    mov [dstq+dsq*1], r4w
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .dy2_w2_loop
    RET
%endif
INIT_XMM ssse3
.dy2_w4:
%if ARCH_X86_64
    mov myd, mym
    movzx t0d, t0b
    dec srcq
    movd m15, t0d
%else
 %define m10 [base+pd_0x3ff]
 %define m11 [base+pd_0x4000]
 %define m8 m0
 %xdefine m14 m4
 %define m15 m3
 %define dstq r0
 %if isprep
  %define ssq r3
 %endif
    movzx r4, byte [esp+0x1f0]
    dec srcq
    movd m15, r4
%endif
    pmaddwd m8, [base+rescale_mul]
%if ARCH_X86_64
    mova m11, [base+pd_0x4000]
%endif
    pshufd m15, m15, q0000
    paddd m14, m8 ; mx+dx*[0-3]
    pand m8, m14, m10
    psrld m8, 6
    paddd m15, m8
    psrldq m7, m15, 8
%if ARCH_X86_64
    movd r4d, m15
    movd r11d, m7
    psrldq m15, 4
    psrldq m7, 4
    movd r6d, m15
    movd r13d, m7
    movd m15, [base+subpel_filters+ r4*8+2]
    movd m2, [base+subpel_filters+r11*8+2]
    movd m3, [base+subpel_filters+ r6*8+2]
    movd m4, [base+subpel_filters+r13*8+2]
    movq m6, [base+subpel_s_shuf2]
    shr myd, 6
    mov r4d, 64 << 24
    lea myd, [t1+myq]
    cmovnz r4q, [base+subpel_filters+myq*8]
%else
    movd r1, m15
    movd r3, m7
    psrldq m15, 4
    psrldq m7, 4
    movd r4, m15
    movd r5, m7
 %define m15 m5
    SWAP m4, m7
    movd m15, [base+subpel_filters+r1*8+2]
    movd m2, [base+subpel_filters+r3*8+2]
    movd m3, [base+subpel_filters+r4*8+2]
    movd m4, [base+subpel_filters+r5*8+2]
    movq m6, [base+subpel_s_shuf2]
    mov myd, mym
    mov r3, [esp+0x1f4]
    xor r5, r5
    shr myd, 6
    lea r3, [r3+myd]
    mov r4, 64 << 24
    cmovnz r4, [base+subpel_filters+r3*8+0]
    cmovnz r5, [base+subpel_filters+r3*8+4]
    mov r3, r3m
 %if isprep
    lea ss3q, [ssq*3]
 %endif
%endif
    punpckldq m15, m3
    punpckldq m2, m4
    punpcklqdq m15, m2
%if ARCH_X86_64
    pcmpeqd m8, m9
    psrld m14, 10
    movu m0, [srcq+ssq*0]
    movu m2, [srcq+ssq*2]
    movu m1, [srcq+ssq*1]
    movu m3, [srcq+ss3q ]
    lea srcq, [srcq+ssq*4]
    punpcklqdq m6, m6
    pshufb m14, [base+bdct_lb_dw]
    movu m4, [srcq+ssq*0]
    movu m5, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    pand m11, m8
    pandn m8, m15
    SWAP m15, m8
    por m15, m11
    paddb m14, m6
    movq m11, r4q
    punpcklbw m11, m11
    psraw m11, 8
    pshufb m0, m14
    pshufb m2, m14
    pshufb m1, m14
    pshufb m3, m14
    pshufb m4, m14
    pshufb m5, m14
    pmaddubsw m0, m15
    pmaddubsw m2, m15
    pmaddubsw m1, m15
    pmaddubsw m3, m15
    pmaddubsw m4, m15
    pmaddubsw m5, m15
    phaddw m0, m2
    phaddw m1, m3
    phaddw m4, m5
    pmulhrsw m0, m12 ; 0 2
    pmulhrsw m1, m12 ; 1 3
    pmulhrsw m4, m12 ; 4 5
    pshufd m8, m11, q0000
    pshufd m9, m11, q1111
    pshufd m10, m11, q2222
    pshufd m11, m11, q3333
%else
    pxor m3, m3
    pcmpeqd m8, m3
    psrld m14, 10
    pshufb m14, [base+bdct_lb_dw]
    movu m1, [srcq+ssq*0]
    movu m2, [srcq+ssq*2]
    movu m3, [srcq+ssq*1]
    add srcq, ss3q
    punpcklqdq m6, m6
    SWAP m4, m7
    pand m7, m11, m8
    pandn m8, m15
    SWAP m15, m8
    por m15, m7
    paddb m14, m6
    movu m0, [srcq+ssq*0]
    movu m7, [srcq+ssq*1]
    movu m6, [srcq+ssq*2]
    add srcq, ss3q
    pshufb m1, m14
    pshufb m2, m14
    pshufb m3, m14
    pshufb m0, m14
    pshufb m7, m14
    pshufb m6, m14
    pmaddubsw m1, m15
    pmaddubsw m2, m15
    pmaddubsw m3, m15
    mova [esp+0x00], m14
    mova [esp+0x10], m15
    pmaddubsw m0, m15
    pmaddubsw m7, m15
    pmaddubsw m6, m15
 %define m14 [esp+0x00]
 %define m15 [esp+0x10]
    phaddw m1, m2
    phaddw m3, m0
    phaddw m7, m6
 %ifidn %1, put
    mov dsd, dsm
  %define dstq r5
 %else
  %define tmpq r5
 %endif
    movd m6, r4
    movd m0, r5
    punpckldq m6, m0
    punpcklbw m6, m6
    psraw m6, 8
    mov r5, r0m
    pmulhrsw m1, m12 ; 0 2
    pmulhrsw m3, m12 ; 1 3
    pmulhrsw m7, m12 ; 4 5
    SWAP m0, m1, m3
    SWAP m4, m7
    pshufd m2, m6, q0000
    pshufd m3, m6, q1111
    pshufd m7, m6, q2222
    pshufd m6, m6, q3333
    mova [esp+0x30], m2
    mova [esp+0x40], m3
    mova [esp+0x50], m7
    mova [esp+0x60], m6
 %define m8 [esp+0x30]
 %define m9 [esp+0x40]
 %define m10 [esp+0x50]
 %define m11 [esp+0x60]
%endif
    psrldq m5, m4, 8 ; 5 _
    punpckhwd m2, m0, m1 ; 23
    punpcklwd m0, m1 ; 01
    punpcklwd m4, m5 ; 45
.dy2_w4_loop:
    pmaddwd m0, m8 ; a0
    pmaddwd m5, m2, m8 ; b0
    pmaddwd m2, m9 ; a1
    pmaddwd m7, m4, m9 ; b1
    pmaddwd m3, m4, m10 ; a2
    paddd m0, m13
    paddd m5, m13
    paddd m0, m2
    paddd m5, m7
    paddd m0, m3
    movu m6, [srcq+ssq*0]
    movu m7, [srcq+ssq*1]
    movu m3, [srcq+ssq*2]
    movu m1, [srcq+ss3q ]
    lea srcq, [srcq+ssq*4]
    pshufb m6, m14
    pshufb m7, m14
    pshufb m3, m14
    pshufb m1, m14
    pmaddubsw m6, m15
    pmaddubsw m7, m15
    pmaddubsw m3, m15
    pmaddubsw m1, m15
    phaddw m6, m7
    phaddw m3, m1
    pmulhrsw m6, m12 ; 6 7
    pmulhrsw m3, m12 ; 8 9
    psrldq m7, m6, 8
    psrldq m1, m3, 8
    punpcklwd m6, m7 ; 67
    punpcklwd m3, m1 ; 89
    mova m2, m6
    pmaddwd m1, m6, m10 ; b2
    pmaddwd m6, m11 ; a3
    pmaddwd m7, m3, m11 ; b3
    paddd m5, m1
    paddd m0, m6
    paddd m5, m7
    psrad m0, rndshift
    psrad m5, rndshift
    packssdw m0, m5
%ifidn %1, put
    packuswb m0, m0
    psrldq m1, m0, 4
    movd [dstq+dsq*0], m0
    movd [dstq+dsq*1], m1
    lea dstq, [dstq+dsq*2]
%else
    mova [tmpq], m0
    add tmpq, 16
%endif
    mova m0, m4
    mova m4, m3
    sub hd, 2
    jg .dy2_w4_loop
    MC_8TAP_SCALED_RET
INIT_XMM ssse3
.dy2_w8:
    mov dword [rsp+0x90], 1
    movifprep tmp_stridem, 16
    jmp .dy2_w_start
.dy2_w16:
    mov dword [rsp+0x90], 2
    movifprep tmp_stridem, 32
    jmp .dy2_w_start
.dy2_w32:
    mov dword [rsp+0x90], 4
    movifprep tmp_stridem, 64
    jmp .dy2_w_start
.dy2_w64:
    mov dword [rsp+0x90], 8
    movifprep tmp_stridem, 128
    jmp .dy2_w_start
.dy2_w128:
    mov dword [rsp+0x90], 16
    movifprep tmp_stridem, 256
.dy2_w_start:
    mov myd, mym
%ifidn %1, put
    movifnidn dsm, dsq
%endif
%if ARCH_X86_64
    shr t0d, 16
    sub srcq, 3
    shr myd, 6
    mov r4d, 64 << 24
    lea myd, [t1+myq]
    cmovnz r4q, [base+subpel_filters+myq*8]
    movd m15, t0d
%else
 %define m10 [base+pd_0x3ff]
 %define m11 [base+pd_0x4000]
 %define m8 m0
 %define m9 m1
 %xdefine m14 m4
 %xdefine m15 m3
 %if isprep
  %define tmpq r0
  %define ssq ssm
 %else
  %define dstq r0
 %endif
    mov r5, [esp+0x1f0]
    mov r3, [esp+0x1f4]
    shr r5, 16
    sub srcq, 3
    movd m15, r5
    xor r5, r5
    shr myd, 6
    lea r3, [r3+myd]
    mov r4, 64 << 24
    cmovnz r4, [base+subpel_filters+r3*8+0]
    cmovnz r5, [base+subpel_filters+r3*8+4]
    mov r0, r0m
    mov r3, r3m
%endif
    pslld m7, m8, 2 ; dx*4
    pmaddwd m8, [base+rescale_mul] ; dx*[0-3]
    pshufd m15, m15, q0000
    paddd m14, m8 ; mx+dx*[0-3]
%if ARCH_X86_64
    movq m3, r4q
    punpcklbw m3, m3
    psraw m3, 8
%else
    movd m5, r4
    movd m6, r5
    punpckldq m5, m6
    punpcklbw m5, m5
    psraw m5, 8
    SWAP m3, m5
%endif
    mova [rsp+0x100], m7
    mova [rsp+0x120], m15
    mov [rsp+0x098], srcq
    mov [rsp+0x130], r0q ; dstq / tmpq
    pshufd m0, m3, q0000
    pshufd m1, m3, q1111
    pshufd m2, m3, q2222
    pshufd m3, m3, q3333
    mova [rsp+0x140], m0
    mova [rsp+0x150], m1
    mova [rsp+0x160], m2
    mova [rsp+0x170], m3
%if ARCH_X86_64 && UNIX64
    mov hm, hd
%elif ARCH_X86_32
    SWAP m5, m3
    mov r5, hm
    mov [esp+0x134], r5
%endif
    jmp .dy2_hloop
.dy2_hloop_prep:
    dec dword [rsp+0x090]
    jz .ret
%if ARCH_X86_64
    add qword [rsp+0x130], 8*(isprep+1)
    mov hd, hm
%else
    add dword [rsp+0x130], 8*(isprep+1)
    mov r5, [esp+0x134]
    mov r0, [esp+0x130]
%endif
    mova m7, [rsp+0x100]
    mova m14, [rsp+0x110]
%if ARCH_X86_64
    mova m10, [base+pd_0x3ff]
%else
 %define m10 [base+pd_0x3ff]
%endif
    mova m15, [rsp+0x120]
    mov srcq, [rsp+0x098]
%if ARCH_X86_64
    mov r0q, [rsp+0x130] ; dstq / tmpq
%else
    mov hm, r5
    mov r0m, r0
    mov r3, r3m
%endif
    paddd m14, m7
.dy2_hloop:
    pxor m9, m9
%if ARCH_X86_64
    mova m11, [base+pq_0x40000000]
%else
 %define m11 [base+pq_0x40000000]
%endif
    psrld m2, m14, 10
    mova [rsp], m2
    pand m6, m14, m10
    psrld m6, 6
    paddd m5, m15, m6
    pcmpeqd m6, m9
    psrldq m2, m5, 8
%if ARCH_X86_64
    movd r4d, m5
    movd r6d, m2
    psrldq m5, 4
    psrldq m2, 4
    movd r7d, m5
    movd r9d, m2
    movq m0, [base+subpel_filters+r4*8]
    movq m1, [base+subpel_filters+r6*8]
    movhps m0, [base+subpel_filters+r7*8]
    movhps m1, [base+subpel_filters+r9*8]
%else
    movd r0, m5
    movd rX, m2
    psrldq m5, 4
    psrldq m2, 4
    movd r4, m5
    movd r5, m2
    movq m0, [base+subpel_filters+r0*8]
    movq m1, [base+subpel_filters+rX*8]
    movhps m0, [base+subpel_filters+r4*8]
    movhps m1, [base+subpel_filters+r5*8]
    pxor m2, m2
 %define m9 m2
%endif
    paddd m14, m7 ; mx+dx*[4-7]
    pand m5, m14, m10
    psrld m5, 6
    paddd m15, m5
    pcmpeqd m5, m9
    mova [rsp+0x110], m14
    psrldq m4, m15, 8
%if ARCH_X86_64
    movd r10d, m15
    movd r11d, m4
    psrldq m15, 4
    psrldq m4, 4
    movd r13d, m15
    movd rXd, m4
    movq m2, [base+subpel_filters+r10*8]
    movq m3, [base+subpel_filters+r11*8]
    movhps m2, [base+subpel_filters+r13*8]
    movhps m3, [base+subpel_filters+ rX*8]
    psrld m14, 10
    psrldq m4, m14, 8
    movd r10d, m14
    movd r11d, m4
    psrldq m14, 4
    psrldq m4, 4
    movd r13d, m14
    movd rXd, m4
    mov r4d, [rsp+ 0]
    mov r6d, [rsp+ 8]
    mov r7d, [rsp+ 4]
    mov r9d, [rsp+12]
    pshufd m4, m6, q1100
    pshufd m6, m6, q3322
    pshufd m7, m5, q1100
    pshufd m5, m5, q3322
    pand m8, m11, m4
    pand m9, m11, m6
    pand m15, m11, m7
    pand m11, m11, m5
    pandn m4, m0
    pandn m6, m1
    pandn m7, m2
    pandn m5, m3
    por m8, m4
    por m9, m6
    por m15, m7
    por m11, m5
    mova [rsp+0x10], m8
    mova [rsp+0x20], m9
    mova [rsp+0x30], m15
    mova [rsp+0x40], m11
    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 10, 8, 9, 15, 11 ; 0-1
    mova [rsp+0x50], m1
    mova [rsp+0x60], m2
    MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 7, 10, 8, 9, 15, 11 ; 2-3
    mova [rsp+0x70], m3
    mova [rsp+0x80], m4
    MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 4-5
    MC_8TAP_SCALED_H 0,14, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 6-7
    SWAP m7, m0
    SWAP m8, m14
    mova m1, [rsp+0x50]
    mova m2, [rsp+0x60]
    mova m3, [rsp+0x70]
    mova m15, [rsp+0x80]
    punpcklwd m4, m5, m6 ; 45a
    punpckhwd m5, m6 ; 45b
    punpcklwd m6, m7, m8 ; 67a
    punpckhwd m7, m8 ; 67b
    SWAP m14, m8
    mova m8, [rsp+0x140]
    mova m9, [rsp+0x150]
    mova m10, [rsp+0x160]
    mova m11, [rsp+0x170]
    punpcklwd m0, m1, m2 ; 01a
    punpckhwd m1, m2 ; 01b
    punpcklwd m2, m3, m15 ; 23a
    punpckhwd m3, m15 ; 23b
    mova [rsp+0x50], m4
    mova [rsp+0x60], m5
    mova [rsp+0x70], m6
    mova [rsp+0x80], m7
%else
    movd r0, m15
    movd rX, m4
    psrldq m15, 4
    psrldq m4, 4
    movd r4, m15
    movd r5, m4
    mova m14, [esp+0x110]
    movq m2, [base+subpel_filters+r0*8]
    movq m3, [base+subpel_filters+rX*8]
    movhps m2, [base+subpel_filters+r4*8]
    movhps m3, [base+subpel_filters+r5*8]
    psrld m14, 10
    mova [esp+16], m14
    mov r0, [esp+ 0]
    mov rX, [esp+ 8]
    mov r4, [esp+ 4]
    mov r5, [esp+12]
    mova [esp+0x20], m0
    mova [esp+0x30], m1
    mova [esp+0x40], m2
    mova [esp+0x50], m3
    pshufd m4, m6, q1100
    pshufd m6, m6, q3322
    pshufd m7, m5, q1100
    pshufd m5, m5, q3322
    pand m0, m11, m4
    pand m1, m11, m6
    pand m2, m11, m7
    pand m3, m11, m5
    pandn m4, [esp+0x20]
    pandn m6, [esp+0x30]
    pandn m7, [esp+0x40]
    pandn m5, [esp+0x50]
    por m0, m4
    por m1, m6
    por m2, m7
    por m3, m5
    mova [esp+0x20], m0
    mova [esp+0x30], m1
    mova [esp+0x40], m2
    mova [esp+0x50], m3
    MC_8TAP_SCALED_H 0x20, 0x60, 0 ; 0-1
    MC_8TAP_SCALED_H 0x20, 0x180 ; 2-3
    MC_8TAP_SCALED_H 0x20, 0x1a0 ; 4-5
    MC_8TAP_SCALED_H 0x20, 0x1c0 ; 6-7
    mova m5, [esp+0x1a0]
    mova m6, [esp+0x1b0]
    mova m7, [esp+0x1c0]
    mova m0, [esp+0x1d0]
    punpcklwd m4, m5, m6 ; 45a
    punpckhwd m5, m6 ; 45b
    punpcklwd m6, m7, m0 ; 67a
    punpckhwd m7, m0 ; 67b
    mova [esp+0x1a0], m4
    mova [esp+0x1b0], m5
    mova [esp+0x1c0], m6
    mova [esp+0x1d0], m7
    mova m1, [esp+0x060]
    mova m2, [esp+0x070]
    mova m3, [esp+0x180]
    mova m4, [esp+0x190]
    punpcklwd m0, m1, m2 ; 01a
    punpckhwd m1, m2 ; 01b
    punpcklwd m2, m3, m4 ; 23a
    punpckhwd m3, m4 ; 23b
    mova [esp+0x180], m2
    mova [esp+0x190], m3
 %define m8 [esp+0x140]
 %define m9 [esp+0x150]
 %define m10 [esp+0x160]
 %define m11 [esp+0x170]
%endif
.dy2_vloop:
%if ARCH_X86_32
    mov r0, r0m
%endif
    pmaddwd m4, m0, m8
    pmaddwd m5, m1, m8
    pmaddwd m6, m2, m9
    pmaddwd m7, m3, m9
    paddd m4, m13
    paddd m5, m13
    paddd m4, m6
    paddd m5, m7
%if ARCH_X86_64
    pmaddwd m6, [rsp+0x50], m10
    pmaddwd m7, [rsp+0x60], m10
%else
    pmaddwd m6, [esp+0x1a0], m10
    pmaddwd m7, [esp+0x1b0], m10
%endif
    paddd m4, m6
    paddd m5, m7
%if ARCH_X86_64
    pmaddwd m6, [rsp+0x70], m11
    pmaddwd m7, [rsp+0x80], m11
%else
    pmaddwd m6, [esp+0x1c0], m11
    pmaddwd m7, [esp+0x1d0], m11
%endif
    paddd m4, m6
    paddd m5, m7
    psrad m4, rndshift
    psrad m5, rndshift
    packssdw m4, m5
%ifidn %1, put
    packuswb m4, m4
    movq [dstq], m4
    add dstq, dsm
%else
    mova [tmpq], m4
    add tmpq, tmp_stridem
%endif
%if ARCH_X86_32
    mov r0m, r0
%endif
    dec hd
    jz .dy2_hloop_prep
%if ARCH_X86_64
    mova m8, [rsp+0x10]
    mova m9, [rsp+0x20]
    mova m10, [rsp+0x30]
    mova m11, [rsp+0x40]
    mova m0, m2 ; 01a
    mova m1, m3 ; 01b
    MC_8TAP_SCALED_H 2, 6, 3, 4, 5, 7, 14, 15, 8, 9, 10, 11
    mova m3, [rsp+0x50] ; 23a
    mova m4, [rsp+0x60] ; 23b
    mova m5, [rsp+0x70] ; 45a
    mova m7, [rsp+0x80] ; 45b
    mova m8, [rsp+0x140]
    mova m9, [rsp+0x150]
    mova m10, [rsp+0x160]
    mova m11, [rsp+0x170]
    punpcklwd m14, m2, m6 ; 67a
    punpckhwd m2, m6 ; 67b
    mova [rsp+0x50], m5
    mova [rsp+0x60], m7
    mova [rsp+0x70], m14
    mova [rsp+0x80], m2
    mova m2, m3
    mova m3, m4
%else
    MC_8TAP_SCALED_H 0x20, 0
    punpcklwd m6, m0, m4
    punpckhwd m7, m0, m4
    mova m0, [esp+0x180] ; 01a
    mova m1, [esp+0x190] ; 01b
    mova m2, [rsp+0x1a0] ; 23a
    mova m3, [esp+0x1b0] ; 23b
    mova m4, [esp+0x1c0] ; 45a
    mova m5, [esp+0x1d0] ; 45b
    mova [esp+0x180], m2
    mova [esp+0x190], m3
    mova [esp+0x1a0], m4
    mova [esp+0x1b0], m5
    mova [esp+0x1c0], m6 ; 67a
    mova [esp+0x1d0], m7 ; 67b
%endif
    jmp .dy2_vloop
.ret:
    MC_8TAP_SCALED_RET 0
%if ARCH_X86_32 && !isprep && required_stack_alignment > STACK_ALIGNMENT
 %define r0m [rstk+stack_offset+ 4]
 %define r1m [rstk+stack_offset+ 8]
 %define r2m [rstk+stack_offset+12]
 %define r3m [rstk+stack_offset+16]
%endif
%undef isprep
%endmacro

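; The bilinear scaled functions reuse the 8-tap scaled kernels: t0/t1 select
; what appears to be the bilinear-style filter set of subpel_filters (set 5,
; i.e. an offset of 5*15 subpel entries) for both directions, then tail-call
; the corresponding 8tap_scaled entry point.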
%macro BILIN_SCALED_FN 1
cglobal %1_bilin_scaled_8bpc
    mov        t0d, (5*15 << 16) | 5*15
    mov        t1d, (5*15 << 16) | 5*15
    jmp mangle(private_prefix %+ _%1_8tap_scaled_8bpc %+ SUFFIX)
%endmacro

%if WIN64
DECLARE_REG_TMP 6, 5
%elif ARCH_X86_64
DECLARE_REG_TMP 6, 8
%else
DECLARE_REG_TMP 1, 2
%endif
BILIN_SCALED_FN put
FN put_8tap_scaled, sharp,          SHARP,   SHARP
FN put_8tap_scaled, sharp_smooth,   SHARP,   SMOOTH
FN put_8tap_scaled, smooth_sharp,   SMOOTH,  SHARP
FN put_8tap_scaled, smooth,         SMOOTH,  SMOOTH
FN put_8tap_scaled, sharp_regular,  SHARP,   REGULAR
FN put_8tap_scaled, regular_sharp,  REGULAR, SHARP
FN put_8tap_scaled, smooth_regular, SMOOTH,  REGULAR
FN put_8tap_scaled, regular_smooth, REGULAR, SMOOTH
FN put_8tap_scaled, regular,        REGULAR, REGULAR
MC_8TAP_SCALED put

%if WIN64
DECLARE_REG_TMP 5, 4
%elif ARCH_X86_64
DECLARE_REG_TMP 6, 7
%else
DECLARE_REG_TMP 1, 2
%endif
BILIN_SCALED_FN prep
FN prep_8tap_scaled, sharp,          SHARP,   SHARP
FN prep_8tap_scaled, sharp_smooth,   SHARP,   SMOOTH
FN prep_8tap_scaled, smooth_sharp,   SMOOTH,  SHARP
FN prep_8tap_scaled, smooth,         SMOOTH,  SMOOTH
FN prep_8tap_scaled, sharp_regular,  SHARP,   REGULAR
FN prep_8tap_scaled, regular_sharp,  REGULAR, SHARP
FN prep_8tap_scaled, smooth_regular, SMOOTH,  REGULAR
FN prep_8tap_scaled, regular_smooth, REGULAR, SMOOTH
FN prep_8tap_scaled, regular,        REGULAR, REGULAR
MC_8TAP_SCALED prep

%if ARCH_X86_32
 %macro SAVE_ALPHA_BETA 0
    mov        alpham, alphad
    mov        betam, betad
 %endmacro

 %macro SAVE_DELTA_GAMMA 0
    mov        deltam, deltad
    mov        gammam, gammad
 %endmacro

 %macro LOAD_ALPHA_BETA_MX 0
    mov        mym, myd
    mov        alphad, alpham
    mov        betad, betam
    mov        mxd, mxm
 %endmacro

 %macro LOAD_DELTA_GAMMA_MY 0
    mov        mxm, mxd
    mov        deltad, deltam
    mov        gammad, gammam
    mov        myd, mym
 %endmacro

 %define PIC_reg r2
 %define PIC_base_offset $$
 %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset)
%else
 %define SAVE_ALPHA_BETA
 %define SAVE_DELTA_GAMMA
 %define PIC_sym(sym) sym
%endif

%if ARCH_X86_32
 %if STACK_ALIGNMENT < required_stack_alignment
  %assign copy_args 8*4
 %else
  %assign copy_args 0
 %endif
%endif

%macro RELOC_ARGS 0
 %if copy_args
    mov        r0, r0m
    mov        r1, r1m
    mov        r2, r2m
    mov        r3, r3m
    mov        r5, r5m
    mov        dstm, r0
    mov        dsm, r1
    mov        srcm, r2
    mov        ssm, r3
    mov        mxm, r5
    mov        r0, r6m
    mov        mym, r0
 %endif
%endmacro

%macro BLENDHWDW 2 ; blend high words from dwords, src1, src2
 %if cpuflag(sse4)
    pblendw    %1, %2, 0xAA
 %else
    pand       %2, m10
    por        %1, %2
 %endif
%endmacro
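; WARP_V computes one output row of the 8x8 warp: it gathers eight vertical
; 8-tap filters (a-h, one per column) from the filter table using indices
; (my + i*delta) >> 10, interleaves their taps, and accumulates the
; word-interleaved source row pairs %3-%10 into two dword sum vectors
; (%1 = columns a-d, %2 = columns e-h); my is stepped to the next row via
; gamma.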
%macro WARP_V 10 ; dst0, dst1, 0, 2, 4, 6, 1, 3, 5, 7
 %if ARCH_X86_32
  %define m8  m4
  %define m9  m5
  %define m14 m6
  %define m15 m7
  %define m11 m7
 %endif
 %if notcpuflag(ssse3) || ARCH_X86_32
    pxor       m11, m11
 %endif
    lea        tmp1d, [myq+deltaq*4]
    lea        tmp2d, [myq+deltaq*1]
    shr        myd, 10
    shr        tmp1d, 10
    movq       m2, [filterq+myq  *8] ; a
    movq       m8, [filterq+tmp1q*8] ; e
    lea        tmp1d, [tmp2q+deltaq*4]
    lea        myd, [tmp2q+deltaq*1]
    shr        tmp2d, 10
    shr        tmp1d, 10
    movq       m3, [filterq+tmp2q*8] ; b
    movq       m0, [filterq+tmp1q*8] ; f
    punpcklwd  m2, m3
    punpcklwd  m8, m0
    lea        tmp1d, [myq+deltaq*4]
    lea        tmp2d, [myq+deltaq*1]
    shr        myd, 10
    shr        tmp1d, 10
    movq       m0, [filterq+myq  *8] ; c
    movq       m9, [filterq+tmp1q*8] ; g
    lea        tmp1d, [tmp2q+deltaq*4]
    lea        myd, [tmp2q+gammaq] ; my += gamma
    shr        tmp2d, 10
    shr        tmp1d, 10
    movq       m3, [filterq+tmp2q*8] ; d
    movq       m1, [filterq+tmp1q*8] ; h
    punpcklwd  m0, m3
    punpcklwd  m9, m1
    punpckldq  m1, m2, m0
    punpckhdq  m2, m0
    punpcklbw  m0, m11, m1  ; a0 a2 b0 b2 c0 c2 d0 d2 << 8
    punpckhbw  m3, m11, m1  ; a4 a6 b4 b6 c4 c6 d4 d6 << 8
    punpcklbw  m1, m11, m2  ; a1 a3 b1 b3 c1 c3 d1 d3 << 8
    punpckhbw  m14, m11, m2 ; a5 a7 b5 b7 c5 c7 d5 d7 << 8
    pmaddwd    m0, %3
    pmaddwd    m3, %5
    pmaddwd    m1, %7
    pmaddwd    m14, %9
    paddd      m0, m3
    paddd      m1, m14
    paddd      m0, m1
    mova       %1, m0
 %if ARCH_X86_64
    SWAP       m3, m14
 %endif
    punpckldq  m0, m8, m9
    punpckhdq  m8, m9
    punpcklbw  m1, m11, m0  ; e0 e2 f0 f2 g0 g2 h0 h2 << 8
    punpckhbw  m14, m11, m0 ; e4 e6 f4 f6 g4 g6 h4 h6 << 8
    punpcklbw  m2, m11, m8  ; e1 e3 f1 f3 g1 g3 h1 h3 << 8
    punpckhbw  m15, m11, m8 ; e5 e7 f5 f7 g5 g7 h5 h7 << 8
    pmaddwd    m1, %4
    pmaddwd    m14, %6
    pmaddwd    m2, %8
    pmaddwd    m15, %10
    paddd      m1, m14
    paddd      m2, m15
    paddd      m1, m2
    mova       %2, m1
 %if ARCH_X86_64
    SWAP       m14, m3
 %endif
%endmacro

%if ARCH_X86_64
 %define counterd r4d
%else
 %if copy_args == 0
  %define counterd dword r4m
 %else
  %define counterd dword [esp+stack_size-4*7]
 %endif
%endif

%macro WARP_AFFINE_8X8T 0
%if ARCH_X86_64
cglobal warp_affine_8x8t_8bpc, 6, 14, 16, 0x90, tmp, ts
%else
cglobal warp_affine_8x8t_8bpc, 0, 7, 16, -0x130-copy_args, tmp, ts
 %if copy_args
  %define tmpm [esp+stack_size-4*1]
  %define tsm  [esp+stack_size-4*2]
 %endif
%endif
    call mangle(private_prefix %+ _warp_affine_8x8_8bpc_%+cpuname).main
.loop:
%if ARCH_X86_32
 %define m12 m4
 %define m13 m5
 %define m14 m6
 %define m15 m7
    mova       m12, [esp+0xC0]
    mova       m13, [esp+0xD0]
    mova       m14, [esp+0xE0]
    mova       m15, [esp+0xF0]
%endif
%if cpuflag(ssse3)
    psrad      m12, 13
    psrad      m13, 13
    psrad      m14, 13
    psrad      m15, 13
    packssdw   m12, m13
    packssdw   m14, m15
    mova       m13, [PIC_sym(pw_8192)]
    pmulhrsw   m12, m13 ; (x + (1 << 6)) >> 7
    pmulhrsw   m14, m13
%else
 %if ARCH_X86_32
  %define m10 m0
 %endif
    mova       m10, [PIC_sym(pd_16384)]
    paddd      m12, m10
    paddd      m13, m10
    paddd      m14, m10
    paddd      m15, m10
    psrad      m12, 15
    psrad      m13, 15
    psrad      m14, 15
    psrad      m15, 15
    packssdw   m12, m13
    packssdw   m14, m15
%endif
    mova       [tmpq+tsq*0], m12
    mova       [tmpq+tsq*2], m14
    dec        counterd
    jz mangle(private_prefix %+ _warp_affine_8x8_8bpc_%+cpuname).end
%if ARCH_X86_32
    mov        tmpm, tmpd
    mov        r0, [esp+0x100]
    mov        r1, [esp+0x104]
%endif
    call mangle(private_prefix %+ _warp_affine_8x8_8bpc_%+cpuname).main2
    lea        tmpq, [tmpq+tsq*4]
    jmp .loop
%endmacro

%macro WARP_AFFINE_8X8 0
%if ARCH_X86_64
cglobal warp_affine_8x8_8bpc, 6, 14, 16, 0x90, \
                              dst, ds, src, ss, abcd, mx, tmp2, alpha, beta, \
                              filter, tmp1, delta, my, gamma
%else
cglobal warp_affine_8x8_8bpc, 0, 7, 16, -0x130-copy_args, \
                              dst, ds, src, ss, abcd, mx, tmp2, alpha, beta, \
                              filter, tmp1, delta, my, gamma
 %define alphaq  r0
 %define alphad  r0
 %define alpham  [esp+gprsize+0x100]
 %define betaq   r1
 %define betad   r1
 %define betam   [esp+gprsize+0x104]
 %define deltaq  r0
 %define deltad  r0
 %define deltam  [esp+gprsize+0x108]
 %define gammaq  r1
 %define gammad  r1
 %define gammam  [esp+gprsize+0x10C]
 %define filterq r3
 %define tmp1q   r4
 %define tmp1d   r4
 %define tmp1m   [esp+gprsize+0x110]
 %define myq     r5
 %define myd     r5
 %define mym     r6m
 %if copy_args
  %define dstm [esp+stack_size-4*1]
  %define dsm  [esp+stack_size-4*2]
  %define srcm [esp+stack_size-4*3]
  %define ssm  [esp+stack_size-4*4]
  %define mxm  [esp+stack_size-4*5]
  %define mym  [esp+stack_size-4*6]
 %endif
%endif
    call .main
    jmp .start
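; .main filters the initial source rows and leaves two vertically filtered
; output rows as dword sums in m12/m13 and m14/m15; each .loop iteration
; below rounds, packs and stores those two rows. The rounding differs per
; ISA: sse4 uses psrad 18 + packusdw + pavgw ((x + (1 << 10)) >> 11), ssse3
; uses psrad 17 + pmulhrsw pw_8192, and sse2 adds pd_262144 before psrad 19.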
.loop:
%if ARCH_X86_32
    mov        dstm, dstd
    mov        alphad, [esp+0x100]
    mov        betad, [esp+0x104]
%endif
    call .main2
    lea        dstq, [dstq+dsq*2]
.start:
%if notcpuflag(sse4)
 %if cpuflag(ssse3)
  %define roundval pw_8192
 %else
  %define roundval pd_262144
 %endif
 %if ARCH_X86_64
    mova       m10, [PIC_sym(roundval)]
 %else
  %define m10 [PIC_sym(roundval)]
 %endif
%endif
%if ARCH_X86_32
 %define m12 m5
 %define m13 m6
    mova       m12, [esp+0xC0]
    mova       m13, [esp+0xD0]
%endif
%if cpuflag(sse4)
 %if ARCH_X86_32
  %define m11 m4
    pxor       m11, m11
 %endif
    psrad      m12, 18
    psrad      m13, 18
    packusdw   m12, m13
    pavgw      m12, m11 ; (x + (1 << 10)) >> 11
%else
 %if cpuflag(ssse3)
    psrad      m12, 17
    psrad      m13, 17
    packssdw   m12, m13
    pmulhrsw   m12, m10
 %else
    paddd      m12, m10
    paddd      m13, m10
    psrad      m12, 19
    psrad      m13, 19
    packssdw   m12, m13
 %endif
%endif
%if ARCH_X86_32
 %define m14 m6
 %define m15 m7
    mova       m14, [esp+0xE0]
    mova       m15, [esp+0xF0]
%endif
%if cpuflag(sse4)
    psrad      m14, 18
    psrad      m15, 18
    packusdw   m14, m15
    pavgw      m14, m11 ; (x + (1 << 10)) >> 11
%else
 %if cpuflag(ssse3)
    psrad      m14, 17
    psrad      m15, 17
    packssdw   m14, m15
    pmulhrsw   m14, m10
 %else
    paddd      m14, m10
    paddd      m15, m10
    psrad      m14, 19
    psrad      m15, 19
    packssdw   m14, m15
 %endif
%endif
    packuswb   m12, m14
    movq       [dstq+dsq*0], m12
    movhps     [dstq+dsq*1], m12
    dec        counterd
    jg .loop
.end:
    RET
ALIGN function_align
.main:
%assign stack_offset stack_offset+gprsize
%if ARCH_X86_32
 %assign stack_size stack_size+4
 %if copy_args
  %assign stack_offset stack_offset-4
 %endif
    RELOC_ARGS
    LEA        PIC_reg, $$
 %define PIC_mem [esp+gprsize+0x114]
    mov        abcdd, abcdm
 %if copy_args == 0
    mov        ssd, ssm
    mov        mxd, mxm
 %endif
    mov        PIC_mem, PIC_reg
    mov        srcd, srcm
%endif
    movsx      deltad, word [abcdq+2*2]
    movsx      gammad, word [abcdq+2*3]
    lea        tmp1d, [deltaq*3]
    sub        gammad, tmp1d ; gamma -= delta*3
    SAVE_DELTA_GAMMA
%if ARCH_X86_32
    mov        abcdd, abcdm
%endif
    movsx      alphad, word [abcdq+2*0]
    movsx      betad, word [abcdq+2*1]
    lea        tmp1q, [ssq*3+3]
    add        mxd, 512+(64<<10)
    lea        tmp2d, [alphaq*3]
    sub        srcq, tmp1q ; src -= src_stride*3 + 3
%if ARCH_X86_32
    mov        srcm, srcd
    mov        PIC_reg, PIC_mem
%endif
    sub        betad, tmp2d ; beta -= alpha*3
    lea        filterq, [PIC_sym(mc_warp_filter2)]
%if ARCH_X86_64
    mov        myd, r6m
 %if cpuflag(ssse3)
    pxor       m11, m11
 %endif
%endif
    call .h
    psrld      m2, m0, 16
    psrld      m3, m1, 16
%if ARCH_X86_32
 %if notcpuflag(ssse3)
    mova       [esp+gprsize+0x00], m2
 %endif
    mova       [esp+gprsize+0x10], m3
%endif
    call .h
    psrld      m4, m0, 16
    psrld      m5, m1, 16
%if ARCH_X86_32
    mova       [esp+gprsize+0x20], m4
    mova       [esp+gprsize+0x30], m5
%endif
    call .h
%if ARCH_X86_64
 %define blendmask [rsp+gprsize+0x80]
%else
 %if notcpuflag(ssse3)
    mova       m2, [esp+gprsize+0x00]
 %endif
    mova       m3, [esp+gprsize+0x10]
 %define blendmask [esp+gprsize+0x120]
 %define m10 m7
%endif
    pcmpeqd    m10, m10
    pslld      m10, 16
    mova       blendmask, m10
    BLENDHWDW  m2, m0 ; 0
    BLENDHWDW  m3, m1 ; 2
    mova       [rsp+gprsize+0x00], m2
    mova       [rsp+gprsize+0x10], m3
    call .h
%if ARCH_X86_32
    mova       m4, [esp+gprsize+0x20]
    mova       m5, [esp+gprsize+0x30]
%endif
    mova       m10, blendmask
    BLENDHWDW  m4, m0 ; 1
    BLENDHWDW  m5, m1 ; 3
    mova       [rsp+gprsize+0x20], m4
    mova       [rsp+gprsize+0x30], m5
    call .h
%if ARCH_X86_32
 %if notcpuflag(ssse3)
    mova       m2, [esp+gprsize+0x00]
 %endif
    mova       m3, [esp+gprsize+0x10]
 %define m10 m5
%endif
    psrld      m6, m2, 16
    psrld      m7, m3, 16
    mova       m10, blendmask
    BLENDHWDW  m6, m0 ; 2
    BLENDHWDW  m7, m1 ; 4
    mova       [rsp+gprsize+0x40], m6
    mova       [rsp+gprsize+0x50], m7
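    ; each .h call returns one horizontally filtered row with the results
    ; in the high words of its dwords (even columns in m0, odd in m1);
    ; the previous row sits in the low words after psrld 16, so BLENDHWDW
    ; builds the word-interleaved row pairs (labeled "; 0".."; 8") that
    ; WARP_V consumes from the stack slots 0x00-0x70.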
    call .h
%if ARCH_X86_32
    mova       m4, [esp+gprsize+0x20]
    mova       m5, [esp+gprsize+0x30]
%endif
    psrld      m2, m4, 16
    psrld      m3, m5, 16
    mova       m10, blendmask
    BLENDHWDW  m2, m0 ; 3
    BLENDHWDW  m3, m1 ; 5
    mova       [rsp+gprsize+0x60], m2
    mova       [rsp+gprsize+0x70], m3
    call .h
%if ARCH_X86_32
    mova       m6, [esp+gprsize+0x40]
    mova       m7, [esp+gprsize+0x50]
 %define m10 m7
%endif
    psrld      m4, m6, 16
    psrld      m5, m7, 16
    mova       m10, blendmask
    BLENDHWDW  m4, m0 ; 4
    BLENDHWDW  m5, m1 ; 6
%if ARCH_X86_64
    add        myd, 512+(64<<10)
    mova       m6, m2
    mova       m7, m3
%else
    mova       [esp+gprsize+0x80], m4
    mova       [esp+gprsize+0x90], m5
    add        dword mym, 512+(64<<10)
%endif
    mov        counterd, 4
    SAVE_ALPHA_BETA
.main2:
    call .h
%if ARCH_X86_32
    mova       m6, [esp+gprsize+0x60]
    mova       m7, [esp+gprsize+0x70]
 %define m10 m5
%endif
    psrld      m6, 16
    psrld      m7, 16
    mova       m10, blendmask
    BLENDHWDW  m6, m0 ; 5
    BLENDHWDW  m7, m1 ; 7
%if ARCH_X86_64
    WARP_V     m12, m13, [rsp+gprsize+0x00], [rsp+gprsize+0x10], \
                         m4, m5, \
                         [rsp+gprsize+0x20], [rsp+gprsize+0x30], \
                         m6, m7
%else
    mova       [esp+gprsize+0xA0], m6
    mova       [esp+gprsize+0xB0], m7
    LOAD_DELTA_GAMMA_MY
    WARP_V     [esp+gprsize+0xC0], [esp+gprsize+0xD0], \
               [esp+gprsize+0x00], [esp+gprsize+0x10], \
               [esp+gprsize+0x80], [esp+gprsize+0x90], \
               [esp+gprsize+0x20], [esp+gprsize+0x30], \
               [esp+gprsize+0xA0], [esp+gprsize+0xB0]
    LOAD_ALPHA_BETA_MX
%endif
    call .h
    mova       m2, [rsp+gprsize+0x40]
    mova       m3, [rsp+gprsize+0x50]
%if ARCH_X86_32
    mova       m4, [rsp+gprsize+0x80]
    mova       m5, [rsp+gprsize+0x90]
 %define m10 m7
%endif
    mova       [rsp+gprsize+0x00], m2
    mova       [rsp+gprsize+0x10], m3
    mova       [rsp+gprsize+0x40], m4
    mova       [rsp+gprsize+0x50], m5
    psrld      m4, 16
    psrld      m5, 16
    mova       m10, blendmask
    BLENDHWDW  m4, m0 ; 6
    BLENDHWDW  m5, m1 ; 8
%if ARCH_X86_64
    WARP_V     m14, m15, [rsp+gprsize+0x20], [rsp+gprsize+0x30], \
                         m6, m7, \
                         [rsp+gprsize+0x00], [rsp+gprsize+0x10], \
                         m4, m5
%else
    mova       [esp+gprsize+0x80], m4
    mova       [esp+gprsize+0x90], m5
    LOAD_DELTA_GAMMA_MY
    WARP_V     [esp+gprsize+0xE0], [esp+gprsize+0xF0], \
               [esp+gprsize+0x20], [esp+gprsize+0x30], \
               [esp+gprsize+0xA0], [esp+gprsize+0xB0], \
               [esp+gprsize+0x00], [esp+gprsize+0x10], \
               [esp+gprsize+0x80], [esp+gprsize+0x90]
    mov        mym, myd
    mov        dstd, dstm
    mov        dsd, dsm
    mov        mxd, mxm
%endif
    mova       m2, [rsp+gprsize+0x60]
    mova       m3, [rsp+gprsize+0x70]
%if ARCH_X86_32
    mova       m6, [esp+gprsize+0xA0]
    mova       m7, [esp+gprsize+0xB0]
%endif
    mova       [rsp+gprsize+0x20], m2
    mova       [rsp+gprsize+0x30], m3
    mova       [rsp+gprsize+0x60], m6
    mova       [rsp+gprsize+0x70], m7
    ret
ALIGN function_align
.h:
%if ARCH_X86_32
 %define m8  m3
 %define m9  m4
 %define m10 m5
 %define m14 m6
 %define m15 m7
%endif
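    ; per-column filter selection: column i of the row uses the filter at
    ; index (mx + i*alpha) >> 10; once all 8 columns are fetched, mx is
    ; advanced by beta for the next row ("mx += beta" below).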
    lea        tmp1d, [mxq+alphaq*4]
    lea        tmp2d, [mxq+alphaq*1]
%if ARCH_X86_32
 %assign stack_offset stack_offset+4
 %assign stack_size stack_size+4
 %define PIC_mem [esp+gprsize*2+0x114]
    mov        PIC_mem, PIC_reg
    mov        srcd, srcm
%endif
    movu       m10, [srcq]
%if ARCH_X86_32
    add        srcd, ssm
    mov        srcm, srcd
    mov        PIC_reg, PIC_mem
%else
    add        srcq, ssq
%endif
    shr        mxd, 10
    shr        tmp1d, 10
    movq       m1, [filterq+mxq  *8] ; 0 X
    movq       m8, [filterq+tmp1q*8] ; 4 X
    lea        tmp1d, [tmp2q+alphaq*4]
    lea        mxd, [tmp2q+alphaq*1]
    shr        tmp2d, 10
    shr        tmp1d, 10
    movhps     m1, [filterq+tmp2q*8] ; 0 1
    movhps     m8, [filterq+tmp1q*8] ; 4 5
    lea        tmp1d, [mxq+alphaq*4]
    lea        tmp2d, [mxq+alphaq*1]
    shr        mxd, 10
    shr        tmp1d, 10
%if cpuflag(ssse3)
    movq       m14, [filterq+mxq  *8] ; 2 X
    movq       m9, [filterq+tmp1q*8] ; 6 X
    lea        tmp1d, [tmp2q+alphaq*4]
    lea        mxd, [tmp2q+betaq] ; mx += beta
    shr        tmp2d, 10
    shr        tmp1d, 10
    movhps     m14, [filterq+tmp2q*8] ; 2 3
    movhps     m9, [filterq+tmp1q*8] ; 6 7
    pshufb     m0, m10, [PIC_sym(warp_8x8_shufA)]
    pmaddubsw  m0, m1
    pshufb     m1, m10, [PIC_sym(warp_8x8_shufB)]
    pmaddubsw  m1, m8
    pshufb     m15, m10, [PIC_sym(warp_8x8_shufC)]
    pmaddubsw  m15, m14
    pshufb     m10, m10, [PIC_sym(warp_8x8_shufD)]
    pmaddubsw  m10, m9
    phaddw     m0, m15
    phaddw     m1, m10
%else
 %if ARCH_X86_32
  %define m11 m2
 %endif
    pcmpeqw    m0, m0
    psrlw      m14, m0, 8
    psrlw      m15, m10, 8 ; 01 03 05 07 09 11 13 15
    pand       m14, m10    ; 00 02 04 06 08 10 12 14
    packuswb   m14, m15    ; 00 02 04 06 08 10 12 14 01 03 05 07 09 11 13 15
    psrldq     m9, m0, 4
    pshufd     m0, m14, q0220
    pand       m0, m9
    psrldq     m14, 1 ; 02 04 06 08 10 12 14 01 03 05 07 09 11 13 15 __
    pslldq     m15, m14, 12
    por        m0, m15 ; shufA
    psrlw      m15, m0, 8
    psraw      m11, m1, 8
    psllw      m0, 8
    psllw      m1, 8
    psrlw      m0, 8
    psraw      m1, 8
    pmullw     m15, m11
    pmullw     m0, m1
    paddw      m0, m15 ; pmaddubsw m0, m1
    pshufd     m15, m14, q0220
    pand       m15, m9
    psrldq     m14, 1 ; 04 06 08 10 12 14 01 03 05 07 09 11 13 15 __ __
    pslldq     m1, m14, 12
    por        m15, m1 ; shufC
    pshufd     m1, m14, q0220
    pand       m1, m9
    psrldq     m14, 1 ; 06 08 10 12 14 01 03 05 07 09 11 13 15 __ __ __
    pslldq     m11, m14, 12
    por        m1, m11 ; shufB
    pshufd     m10, m14, q0220
    pand       m10, m9
    psrldq     m14, 1 ; 08 10 12 14 01 03 05 07 09 11 13 15 __ __ __ __
    pslldq     m14, m14, 12
    por        m10, m14 ; shufD
    psrlw      m9, m1, 8
    psraw      m11, m8, 8
    psllw      m1, 8
    psllw      m8, 8
    psrlw      m1, 8
    psraw      m8, 8
    pmullw     m9, m11
    pmullw     m1, m8
    paddw      m1, m9 ; pmaddubsw m1, m8
    movq       m14, [filterq+mxq  *8] ; 2 X
    movq       m9, [filterq+tmp1q*8] ; 6 X
    lea        tmp1d, [tmp2q+alphaq*4]
    lea        mxd, [tmp2q+betaq] ; mx += beta
    shr        tmp2d, 10
    shr        tmp1d, 10
    movhps     m14, [filterq+tmp2q*8] ; 2 3
    movhps     m9, [filterq+tmp1q*8] ; 6 7
    psrlw      m8, m15, 8
    psraw      m11, m14, 8
    psllw      m15, 8
    psllw      m14, 8
    psrlw      m15, 8
    psraw      m14, 8
    pmullw     m8, m11
    pmullw     m15, m14
    paddw      m15, m8 ; pmaddubsw m15, m14
    psrlw      m8, m10, 8
    psraw      m11, m9, 8
    psllw      m10, 8
    psllw      m9, 8
    psrlw      m10, 8
    psraw      m9, 8
    pmullw     m8, m11
    pmullw     m10, m9
    paddw      m10, m8 ; pmaddubsw m10, m9
    pslld      m8, m0, 16
    pslld      m9, m1, 16
    pslld      m14, m15, 16
    pslld      m11, m10, 16
    paddw      m0, m8
    paddw      m1, m9
    paddw      m15, m14
    paddw      m10, m11
    psrad      m0, 16
    psrad      m1, 16
    psrad      m15, 16
    psrad      m10, 16
    packssdw   m0, m15 ; phaddw m0, m15
    packssdw   m1, m10 ; phaddw m1, m10
%endif
    mova       m14, [PIC_sym(pw_8192)]
    mova       m9, [PIC_sym(pd_32768)]
    pmaddwd    m0, m14 ; 17-bit intermediate, upshifted by 13
    pmaddwd    m1, m14
    paddd      m0, m9 ; rounded 14-bit result in upper 16 bits of dword
    paddd      m1, m9
    ret
%endmacro

%if WIN64
DECLARE_REG_TMP 6, 4
%else
DECLARE_REG_TMP 6, 7
%endif

%macro BIDIR_FN 1 ; op
    %1         0
    lea        stride3q, [strideq*3]
    jmp        wq
.w4_loop:
    %1_INC_PTR 2
    %1         0
    lea        dstq, [dstq+strideq*4]
.w4: ; tile 4x
    movd       [dstq          ], m0 ; copy dw[0]
    pshuflw    m1, m0, q1032        ; swap dw[1] and dw[0]
    movd       [dstq+strideq*1], m1 ; copy dw[1]
    punpckhqdq m0, m0               ; swap dw[3,2] with dw[1,0]
    movd       [dstq+strideq*2], m0 ; dw[2]
    psrlq      m0, 32               ; shift right in dw[3]
    movd       [dstq+stride3q ], m0 ; copy
    sub        hd, 4
    jg .w4_loop
    RET
.w8_loop:
    %1_INC_PTR 2
    %1         0
    lea        dstq, [dstq+strideq*2]
.w8:
    movq       [dstq          ], m0
    movhps     [dstq+strideq*1], m0
    sub        hd, 2
    jg .w8_loop
    RET
.w16_loop:
    %1_INC_PTR 2
    %1         0
    lea        dstq, [dstq+strideq]
.w16:
    mova       [dstq          ], m0
    dec        hd
    jg .w16_loop
    RET
.w32_loop:
    %1_INC_PTR 4
    %1         0
    lea        dstq, [dstq+strideq]
.w32:
    mova       [dstq          ], m0
    %1         2
    mova       [dstq + 16     ], m0
    dec        hd
    jg .w32_loop
    RET
.w64_loop:
    %1_INC_PTR 8
    %1         0
    add        dstq, strideq
.w64:
 %assign i 0
 %rep 4
    mova       [dstq + i*16   ], m0
  %assign i i+1
  %if i < 4
    %1         2*i
  %endif
 %endrep
    dec        hd
    jg .w64_loop
    RET
.w128_loop:
    %1_INC_PTR 16
    %1         0
    add        dstq, strideq
.w128:
 %assign i 0
 %rep 8
    mova       [dstq + i*16   ], m0
  %assign i i+1
  %if i < 8
    %1         2*i
  %endif
 %endrep
    dec        hd
    jg .w128_loop
    RET
%endmacro
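; avg: with pw_1024, pmulhrsw computes (x*1024 + 16384) >> 15, so each
; output pixel is clip_pixel((tmp1 + tmp2 + 16) >> 5) for 8 bpc.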
%macro AVG 1 ; src_offset
    ; writes AVG of tmp1 tmp2 uint16 coeffs into uint8 pixel
    mova       m0, [tmp1q+(%1+0)*mmsize] ; load 8 coef(2bytes) from tmp1
    paddw      m0, [tmp2q+(%1+0)*mmsize] ; load/add 8 coef(2bytes) tmp2
    mova       m1, [tmp1q+(%1+1)*mmsize]
    paddw      m1, [tmp2q+(%1+1)*mmsize]
    pmulhrsw   m0, m2
    pmulhrsw   m1, m2
    packuswb   m0, m1 ; pack/trunc 16 bits from m0 & m1 to 8 bit
%endmacro

%macro AVG_INC_PTR 1
    add        tmp1q, %1*mmsize
    add        tmp2q, %1*mmsize
%endmacro

cglobal avg_8bpc, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
    LEA        r6, avg_ssse3_table
    tzcnt      wd, wm ; trailing zeros (log2 of the tile width)
    movifnidn  hd, hm ; move h(stack) to h(register) if not already that register
    movsxd     wq, dword [r6+wq*4] ; push table entry matching the tile width (tzcnt) in widen reg
    mova       m2, [pw_1024+r6-avg_ssse3_table] ; fill m2 with shift/align
    add        wq, r6
    BIDIR_FN   AVG

%macro W_AVG 1 ; src_offset
    ; (a * weight + b * (16 - weight) + 128) >> 8
    ; = ((a - b) * weight + (b << 4) + 128) >> 8
    ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4
    ; = ((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4
    mova       m2, [tmp1q+(%1+0)*mmsize]
    mova       m0, m2
    psubw      m2, [tmp2q+(%1+0)*mmsize]
    mova       m3, [tmp1q+(%1+1)*mmsize]
    mova       m1, m3
    psubw      m3, [tmp2q+(%1+1)*mmsize]
    pmulhw     m2, m4
    pmulhw     m3, m4
    paddw      m0, m2
    paddw      m1, m3
    pmulhrsw   m0, m5
    pmulhrsw   m1, m5
    packuswb   m0, m1
%endmacro

%define W_AVG_INC_PTR AVG_INC_PTR

cglobal w_avg_8bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
    LEA        r6, w_avg_ssse3_table
    tzcnt      wd, wm
    movd       m4, r6m
    movifnidn  hd, hm
    pxor       m0, m0
    movsxd     wq, dword [r6+wq*4]
    mova       m5, [pw_2048+r6-w_avg_ssse3_table]
    pshufb     m4, m0
    psllw      m4, 12 ; (weight-16) << 12 when interpreted as signed
    add        wq, r6
    cmp        dword r6m, 7
    jg .weight_gt7
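    ; (weight-16) << 12 does not fit in a signed word for weight <= 7, so
    ; in that case swap the operands and use -weight << 12 instead (the
    ; last identity above), which yields the same result.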
    mov        r6, tmp1q
    psubw      m0, m4
    mov        tmp1q, tmp2q
    mova       m4, m0 ; -weight
    mov        tmp2q, r6
.weight_gt7:
    BIDIR_FN   W_AVG

%macro MASK 1 ; src_offset
    ; (a * m + b * (64 - m) + 512) >> 10
    ; = ((a - b) * m + (b << 6) + 512) >> 10
    ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4
    mova       m3, [maskq+(%1+0)*(mmsize/2)]
    mova       m0, [tmp2q+(%1+0)*mmsize] ; b
    psubw      m1, m0, [tmp1q+(%1+0)*mmsize] ; b - a
    mova       m6, m3     ; m
    psubb      m3, m4, m6 ; -m
    paddw      m1, m1     ; (b - a) << 1
    paddb      m3, m3     ; -m << 1
    punpcklbw  m2, m4, m3 ; -m << 9 (<< 8 when ext as uint16)
    pmulhw     m1, m2     ; (-m * (b - a)) << 10
    paddw      m0, m1     ; + b
    mova       m1, [tmp2q+(%1+1)*mmsize] ; b
    psubw      m2, m1, [tmp1q+(%1+1)*mmsize] ; b - a
    paddw      m2, m2     ; (b - a) << 1
    mova       m6, m3     ; (-m << 1)
    punpckhbw  m3, m4, m6 ; (-m << 9)
    pmulhw     m2, m3     ; (-m * (b - a)) << 10
    paddw      m1, m2     ; + b
    pmulhrsw   m0, m5     ; round
    pmulhrsw   m1, m5     ; round
    packuswb   m0, m1     ; interleave 16 -> 8
%endmacro

%macro MASK_INC_PTR 1
    add        maskq, %1*mmsize/2
    add        tmp1q, %1*mmsize
    add        tmp2q, %1*mmsize
%endmacro

%if ARCH_X86_64
cglobal mask_8bpc, 4, 8, 7, dst, stride, tmp1, tmp2, w, h, mask, stride3
    movifnidn  hd, hm
%else
cglobal mask_8bpc, 4, 7, 7, dst, stride, tmp1, tmp2, w, mask, stride3
 %define hd dword r5m
%endif
%define base r6-mask_ssse3_table
    LEA        r6, mask_ssse3_table
    tzcnt      wd, wm
    movsxd     wq, dword [r6+wq*4]
    pxor       m4, m4
    mova       m5, [base+pw_2048]
    add        wq, r6
    mov        maskq, r6m
    BIDIR_FN   MASK
%undef hd

%macro W_MASK_420_END 1-*
%rep %0
    call .main
    paddw      m2, [maskq+16*%1]
    mova       [maskq+16*%1], m2
    mova       [dstq+strideq*1+16*(2*%1+0)], m0
    call .main
    psubw      m3, m7, m2
    psubw      m1, m7, [maskq+16*%1]
    psubw      m3, [dstq+strideq*1+16*(2*%1+1)]
    psrlw      m1, 2
    psrlw      m3, 2
    packuswb   m1, m3
    mova       [maskq+16*%1], m1
    mova       [dstq+strideq*1+16*(2*%1+1)], m0
 %rotate 1
%endrep
%endmacro

%if UNIX64
DECLARE_REG_TMP 7
%else
DECLARE_REG_TMP 5
%endif

cglobal w_mask_420_8bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, mask
%define base t0-w_mask_420_ssse3_table
    LEA        t0, w_mask_420_ssse3_table
    tzcnt      wd, wm
    mov        r6d, r7m ; sign
    sub        tmp2q, tmp1q
    movsxd     wq, [t0+wq*4]
    mova       m6, [base+pw_2048]
    movddup    m7, [base+wm_420_sign+r6*8] ; 258 - sign
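    ; .main returns horizontally paired (64 - m) sums s, so each 2x2 mask
    ; value is formed as (258 - sign - s_top - s_bot) >> 2, which equals
    ; (m00 + m01 + m10 + m11 + 2 - sign) >> 2 because
    ; s_top + s_bot = 256 - (m00 + m01 + m10 + m11).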
    add        wq, t0
%if ARCH_X86_64
    mova       m8, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
    movifnidn  hd, hm
%else
 %define m8 [base+pw_6903]
 %define hd dword hm
%endif
    mov        maskq, maskmp
    call .main
    jmp        wq
.w4_loop:
    call .main
    add        maskq, 4
    lea        dstq, [dstq+strideq*2]
.w4:
    pshufd     m3, m2, q2020
    pshufd     m2, m2, q3131
    psubw      m1, m7, m3
    psubw      m1, m2
    psrlw      m1, 2
    packuswb   m1, m1
    movd       [maskq], m1
    movd       [dstq+strideq*0], m0
    pshuflw    m1, m0, q1032
    movd       [dstq+strideq*1], m1
    punpckhqdq m0, m0
    lea        dstq, [dstq+strideq*2]
    movd       [dstq+strideq*0], m0
    pshuflw    m1, m0, q1032
    movd       [dstq+strideq*1], m1
    sub        hd, 4
    jg .w4_loop
    RET
.w8_loop:
    call .main
    add        maskq, 4
    lea        dstq, [dstq+strideq*2]
.w8:
    movhlps    m3, m2
    psubw      m1, m7, m2
    psubw      m1, m3
    psrlw      m1, 2
    packuswb   m1, m1
    movd       [maskq], m1
    movq       [dstq+strideq*0], m0
    movhps     [dstq+strideq*1], m0
    sub        hd, 2
    jg .w8_loop
    RET
.w16_loop:
    call .main
    add        maskq, 8
    lea        dstq, [dstq+strideq*2]
.w16:
    mova       [dstq+strideq*1], m2
    mova       [dstq+strideq*0], m0
    call .main
    psubw      m1, m7, [dstq+strideq*1]
    psubw      m1, m2
    psrlw      m1, 2
    packuswb   m1, m1
    movq       [maskq], m1
    mova       [dstq+strideq*1], m0
    sub        hd, 2
    jg .w16_loop
    RET
.w32_loop:
    call .main
    add        maskq, 16
    lea        dstq, [dstq+strideq*2]
.w32:
    mova       [maskq], m2
    mova       [dstq+strideq*0+16*0], m0
    call .main
    mova       [dstq+strideq*1+16*1], m2
    mova       [dstq+strideq*0+16*1], m0
    W_MASK_420_END 0
    sub        hd, 2
    jg .w32_loop
    RET
.w64_loop:
    call .main
    add        maskq, 16*2
    lea        dstq, [dstq+strideq*2]
.w64:
    mova       [maskq+16*0], m2
    mova       [dstq+strideq*0+16*0], m0
    call .main
    mova       [dstq+strideq*1+16*1], m2
    mova       [dstq+strideq*0+16*1], m0
    call .main
    mova       [maskq+16*1], m2
    mova       [dstq+strideq*0+16*2], m0
    call .main
    mova       [dstq+strideq*1+16*3], m2
    mova       [dstq+strideq*0+16*3], m0
    W_MASK_420_END 0, 1
    sub        hd, 2
    jg .w64_loop
    RET
.w128_loop:
    call .main
    add        maskq, 16*4
    lea        dstq, [dstq+strideq*2]
.w128:
    mova       [maskq+16*0], m2
    mova       [dstq+strideq*0+16*0], m0
    call .main
    mova       [dstq+strideq*1+16*1], m2
    mova       [dstq+strideq*0+16*1], m0
    call .main
    mova       [maskq+16*1], m2
    mova       [dstq+strideq*0+16*2], m0
    call .main
    mova       [dstq+strideq*1+16*3], m2
    mova       [dstq+strideq*0+16*3], m0
    call .main
    mova       [maskq+16*2], m2
    mova       [dstq+strideq*0+16*4], m0
    call .main
    mova       [dstq+strideq*1+16*5], m2
    mova       [dstq+strideq*0+16*5], m0
    call .main
    mova       [maskq+16*3], m2
    mova       [dstq+strideq*0+16*6], m0
    call .main
    mova       [dstq+strideq*1+16*7], m2
    mova       [dstq+strideq*0+16*7], m0
    W_MASK_420_END 0, 1, 2, 3
    sub        hd, 2
    jg .w128_loop
    RET
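; .main blends one 16-pixel row: m = min(38 + ((|tmp1 - tmp2| + 8) >> 8), 64)
; (computed below as 64 - m = max(pw_6903 - |tmp1 - tmp2|, 0) >> 8), the
; pixels are dst = (tmp1*m + tmp2*(64 - m) + 512) >> 10 (up to intermediate
; truncation), and m2 returns the horizontally paired (64 - m) sums used
; for the mask writes.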
ALIGN function_align
.main:
    mova       m0, [tmp1q      +16*0]
    mova       m3, [tmp1q+tmp2q+16*0]
    mova       m1, [tmp1q      +16*1]
    mova       m4, [tmp1q+tmp2q+16*1]
    add        tmp1q, 16*2
    psubw      m3, m0
    psubw      m4, m1
    pabsw      m5, m3
    psubusw    m2, m8, m5
    psrlw      m2, 8 ; 64 - m
    psllw      m5, m2, 10
    pmulhw     m3, m5
    pabsw      m5, m4
    paddw      m0, m3
    psubusw    m3, m8, m5
    psrlw      m3, 8
    phaddw     m2, m3
    psllw      m3, 10
    pmulhw     m4, m3
    paddw      m1, m4
    pmulhrsw   m0, m6
    pmulhrsw   m1, m6
    packuswb   m0, m1
    ret

%macro W_MASK_422_BACKUP 1 ; mask_offset
%if ARCH_X86_64
    mova       m10, m2
%else
    mova       [maskq+16*%1], m2
%endif
%endmacro

%macro W_MASK_422_END 1 ; mask_offset
%if ARCH_X86_64
    packuswb   m10, m2
    psubb      m1, m7, m10
    pavgb      m1, m9
%else
    mova       m3, [maskq+16*%1]
    packuswb   m3, m2
    pxor       m2, m2
    psubb      m1, m7, m3
    pavgb      m1, m2
%endif
    mova       [maskq+16*%1], m1
%endmacro
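; w_mask_422 shares .main with w_mask_420 and stores one mask byte per 2
; pixels: two batches of paired (64 - m) sums are packed, subtracted from
; 128 - sign and halved with rounding (pavgb against zero).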
cglobal w_mask_422_8bpc, 4, 7, 11, dst, stride, tmp1, tmp2, w, h, mask
%define base t0-w_mask_422_ssse3_table
    LEA        t0, w_mask_422_ssse3_table
    tzcnt      wd, wm
    mov        r6d, r7m ; sign
    sub        tmp2q, tmp1q
    movsxd     wq, [t0+wq*4]
    mova       m6, [base+pw_2048]
    movddup    m7, [base+wm_422_sign+r6*8] ; 128 - sign
    add        wq, t0
%if ARCH_X86_64
    mova       m8, [base+pw_6903]
    pxor       m9, m9
    movifnidn  hd, hm
%else
    add        t0, w_mask_420_ssse3_table-w_mask_422_ssse3_table
 %define hd dword hm
%endif
    mov        maskq, maskmp
    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
    jmp        wq
.w4_loop:
    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
    add        maskq, 8
    lea        dstq, [dstq+strideq*2]
.w4:
    packuswb   m2, m2
    psubb      m1, m7, m2
%if ARCH_X86_64
    pavgb      m1, m9
%else
    pxor       m2, m2
    pavgb      m1, m2
%endif
    movq       [maskq], m1
    movd       [dstq+strideq*0], m0
    pshuflw    m1, m0, q1032
    movd       [dstq+strideq*1], m1
    punpckhqdq m0, m0
    lea        dstq, [dstq+strideq*2]
    movd       [dstq+strideq*0], m0
    pshuflw    m1, m0, q1032
    movd       [dstq+strideq*1], m1
    sub        hd, 4
    jg .w4_loop
    RET
.w8_loop:
    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
    add        maskq, 16
    lea        dstq, [dstq+strideq*2]
.w8:
    W_MASK_422_BACKUP 0
    movq       [dstq+strideq*0], m0
    movhps     [dstq+strideq*1], m0
    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
    lea        dstq, [dstq+strideq*2]
    W_MASK_422_END 0
    movq       [dstq+strideq*0], m0
    movhps     [dstq+strideq*1], m0
    sub        hd, 4
    jg .w8_loop
    RET
.w16_loop:
    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
    add        maskq, 16
    lea        dstq, [dstq+strideq*2]
.w16:
    W_MASK_422_BACKUP 0
    mova       [dstq+strideq*0], m0
    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
    W_MASK_422_END 0
    mova       [dstq+strideq*1], m0
    sub        hd, 2
    jg .w16_loop
    RET
.w32_loop:
    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
    add        maskq, 16
    add        dstq, strideq
.w32:
    W_MASK_422_BACKUP 0
    mova       [dstq+16*0], m0
    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
    W_MASK_422_END 0
    mova       [dstq+16*1], m0
    dec        hd
    jg .w32_loop
    RET
.w64_loop:
    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
    add        maskq, 16*2
    add        dstq, strideq
.w64:
    W_MASK_422_BACKUP 0
    mova       [dstq+16*0], m0
    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
    W_MASK_422_END 0
    mova       [dstq+16*1], m0
    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
    W_MASK_422_BACKUP 1
    mova       [dstq+16*2], m0
    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
    W_MASK_422_END 1
    mova       [dstq+16*3], m0
    dec        hd
    jg .w64_loop
    RET
.w128_loop:
    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
    add        maskq, 16*4
    add        dstq, strideq
.w128:
    W_MASK_422_BACKUP 0
    mova       [dstq+16*0], m0
    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
    W_MASK_422_END 0
    mova       [dstq+16*1], m0
    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
    W_MASK_422_BACKUP 1
    mova       [dstq+16*2], m0
    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
    W_MASK_422_END 1
    mova       [dstq+16*3], m0
    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
    W_MASK_422_BACKUP 2
    mova       [dstq+16*4], m0
    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
    W_MASK_422_END 2
    mova       [dstq+16*5], m0
    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
    W_MASK_422_BACKUP 3
    mova       [dstq+16*6], m0
    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
    W_MASK_422_END 3
    mova       [dstq+16*7], m0
    dec        hd
    jg .w128_loop
    RET

cglobal w_mask_444_8bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, mask
%define base t0-w_mask_444_ssse3_table
    LEA        t0, w_mask_444_ssse3_table
    tzcnt      wd, wm
    mov        maskq, maskmp
    sub        tmp2q, tmp1q
    movsxd     wq, [t0+wq*4]
    mova       m6, [base+pw_6903]
    mova       m7, [base+pw_2048]
    add        wq, t0
%if ARCH_X86_64
    mova       m8, [base+pb_64]
    movifnidn  hd, hm
%else
 %define m8 [base+pb_64]
 %define hd dword hm
%endif
    call .main
    jmp        wq
.w4_loop:
    call .main
    lea        dstq, [dstq+strideq*2]
.w4:
    movd       [dstq+strideq*0], m0
    pshuflw    m1, m0, q1032
    movd       [dstq+strideq*1], m1
    punpckhqdq m0, m0
    lea        dstq, [dstq+strideq*2]
    movd       [dstq+strideq*0], m0
    pshuflw    m1, m0, q1032
    movd       [dstq+strideq*1], m1
    sub        hd, 4
    jg .w4_loop
    RET
.w8_loop:
    call .main
    lea        dstq, [dstq+strideq*2]
.w8:
    movq       [dstq+strideq*0], m0
    movhps     [dstq+strideq*1], m0
    sub        hd, 2
    jg .w8_loop
    RET
.w16_loop:
    call .main
    lea        dstq, [dstq+strideq*2]
.w16:
    mova       [dstq+strideq*0], m0
    call .main
    mova       [dstq+strideq*1], m0
    sub        hd, 2
    jg .w16_loop
    RET
.w32_loop:
    call .main
    add        dstq, strideq
.w32:
    mova       [dstq+16*0], m0
    call .main
    mova       [dstq+16*1], m0
    dec        hd
    jg .w32_loop
    RET
.w64_loop:
    call .main
    add        dstq, strideq
.w64:
    mova       [dstq+16*0], m0
    call .main
    mova       [dstq+16*1], m0
    call .main
    mova       [dstq+16*2], m0
    call .main
    mova       [dstq+16*3], m0
    dec        hd
    jg .w64_loop
    RET
.w128_loop:
    call .main
    add        dstq, strideq
.w128:
    mova       [dstq+16*0], m0
    call .main
    mova       [dstq+16*1], m0
    call .main
    mova       [dstq+16*2], m0
    call .main
    mova       [dstq+16*3], m0
    call .main
    mova       [dstq+16*4], m0
    call .main
    mova       [dstq+16*5], m0
    call .main
    mova       [dstq+16*6], m0
    call .main
    mova       [dstq+16*7], m0
    dec        hd
    jg .w128_loop
    RET
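; w_mask_444 writes one mask byte per pixel: .main below packs the
; (64 - m) values and recovers m directly via psubb from pb_64.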
ALIGN function_align
.main:
    mova       m0, [tmp1q      +16*0]
    mova       m3, [tmp1q+tmp2q+16*0]
    mova       m1, [tmp1q      +16*1]
    mova       m4, [tmp1q+tmp2q+16*1]
    add        tmp1q, 16*2
    psubw      m3, m0
    psubw      m4, m1
    pabsw      m5, m3
    psubusw    m2, m6, m5
    psrlw      m2, 8 ; 64 - m
    psllw      m5, m2, 10
    pmulhw     m3, m5
    pabsw      m5, m4
    paddw      m0, m3
    psubusw    m3, m6, m5
    psrlw      m3, 8
    packuswb   m2, m3
    psllw      m3, 10
    pmulhw     m4, m3
    psubb      m3, m8, m2
    paddw      m1, m4
    pmulhrsw   m0, m7
    pmulhrsw   m1, m7
    mova       [maskq], m3
    add        maskq, 16
    packuswb   m0, m1
    ret

%macro BLEND_64M 4; a, b, mask1, mask2
    punpcklbw  m0, %1, %2; {b;a}[7..0]
    punpckhbw  %1, %2    ; {b;a}[15..8]
    pmaddubsw  m0, %3    ; {b*m[0] + (64-m[0])*a}[7..0] u16
    pmaddubsw  %1, %4    ; {b*m[1] + (64-m[1])*a}[15..8] u16
    pmulhrsw   m0, m5    ; {((b*m[0] + (64-m[0])*a) + 1) / 32}[7..0] u16
    pmulhrsw   %1, m5    ; {((b*m[1] + (64-m[1])*a) + 1) / 32}[15..8] u16
    packuswb   m0, %1    ; {blendpx}[15..0] u8
%endmacro

%macro BLEND 2; a, b
    psubb      m3, m4, m0 ; m3 = (64 - m)
    punpcklbw  m2, m3, m0 ; {m;(64-m)}[7..0]
    punpckhbw  m3, m0     ; {m;(64-m)}[15..8]
    BLEND_64M  %1, %2, m2, m3
%endmacro

cglobal blend_8bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
%define base r6-blend_ssse3_table
    LEA        r6, blend_ssse3_table
    tzcnt      wd, wm
    movifnidn  hd, hm
    movifnidn  maskq, maskmp
    movsxd     wq, dword [r6+wq*4]
    mova       m4, [base+pb_64]
    mova       m5, [base+pw_512]
    add        wq, r6
    lea        r6, [dsq*3]
    jmp        wq
.w4:
    movq       m0, [maskq]; m
    movd       m1, [dstq+dsq*0] ; a
    movd       m6, [dstq+dsq*1]
    punpckldq  m1, m6
    movq       m6, [tmpq] ; b
    psubb      m3, m4, m0 ; m3 = (64 - m)
    punpcklbw  m2, m3, m0 ; {m;(64-m)}[7..0]
    punpcklbw  m1, m6     ; {b;a}[7..0]
    pmaddubsw  m1, m2     ; {b*m[0] + (64-m[0])*a}[7..0] u16
    pmulhrsw   m1, m5     ; {((b*m[0] + (64-m[0])*a) + 1) / 32}[7..0] u16
    packuswb   m1, m0     ; {blendpx}[15..0] u8
    movd       [dstq+dsq*0], m1
    psrlq      m1, 32
    movd       [dstq+dsq*1], m1
    add        maskq, 8
    add        tmpq, 8
    lea        dstq, [dstq+dsq*2] ; dst_stride * 2
    sub        hd, 2
    jg .w4
    RET
.w8:
    mova       m0, [maskq]; m
    movq       m1, [dstq+dsq*0] ; a
    movhps     m1, [dstq+dsq*1]
    mova       m6, [tmpq] ; b
    BLEND      m1, m6
    movq       [dstq+dsq*0], m0
    movhps     [dstq+dsq*1], m0
    add        maskq, 16
    add        tmpq, 16
    lea        dstq, [dstq+dsq*2] ; dst_stride * 2
    sub        hd, 2
    jg .w8
    RET
.w16:
    mova       m0, [maskq]; m
    mova       m1, [dstq] ; a
    mova       m6, [tmpq] ; b
    BLEND      m1, m6
    mova       [dstq], m0
    add        maskq, 16
    add        tmpq, 16
    add        dstq, dsq ; dst_stride
    dec        hd
    jg .w16
    RET
.w32:
 %assign i 0
 %rep 2
    mova       m0, [maskq+16*i]; m
    mova       m1, [dstq+16*i] ; a
    mova       m6, [tmpq+16*i] ; b
    BLEND      m1, m6
    mova       [dstq+i*16], m0
 %assign i i+1
 %endrep
    add        maskq, 32
    add        tmpq, 32
    add        dstq, dsq ; dst_stride
    dec        hd
    jg .w32
    RET

cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mask
%define base r5-blend_v_ssse3_table
    LEA        r5, blend_v_ssse3_table
    tzcnt      wd, wm
    movifnidn  hd, hm
    movsxd     wq, dword [r5+wq*4]
    mova       m5, [base+pw_512]
    add        wq, r5
    add        maskq, obmc_masks-blend_v_ssse3_table
    jmp        wq
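; vertical OBMC blending: the mask depends only on the column, so each
; width case loads its interleaved mask byte pairs from obmc_masks once
; and reuses them for every line.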
.w2:
    movd       m3, [maskq+4]
    punpckldq  m3, m3
    ; 2 mask blend is provided for 4 pixels / 2 lines
.w2_loop:
    movd       m1, [dstq+dsq*0] ; a {..;a;a}
    pinsrw     m1, [dstq+dsq*1], 1
    movd       m2, [tmpq] ; b
    punpcklbw  m0, m1, m2; {b;a}[7..0]
    pmaddubsw  m0, m3 ; {b*m + (64-m)*a}[7..0] u16
    pmulhrsw   m0, m5 ; {((b*m + (64-m)*a) + 1) / 32}[7..0] u16
    packuswb   m0, m1 ; {blendpx}[8..0] u8
    movd       r3d, m0
    mov        [dstq+dsq*0], r3w
    shr        r3d, 16
    mov        [dstq+dsq*1], r3w
    add        tmpq, 2*2
    lea        dstq, [dstq + dsq * 2]
    sub        hd, 2
    jg .w2_loop
    RET
.w4:
    movddup    m3, [maskq+8]
    ; 4 mask blend is provided for 8 pixels / 2 lines
.w4_loop:
    movd       m1, [dstq+dsq*0] ; a
    movd       m2, [dstq+dsq*1] ;
    punpckldq  m1, m2
    movq       m2, [tmpq] ; b
    punpcklbw  m1, m2 ; {b;a}[7..0]
    pmaddubsw  m1, m3 ; {b*m + (64-m)*a}[7..0] u16
    pmulhrsw   m1, m5 ; {((b*m + (64-m)*a) + 1) / 32}[7..0] u16
    packuswb   m1, m1 ; {blendpx}[8..0] u8
    movd       [dstq], m1
    psrlq      m1, 32
    movd       [dstq+dsq*1], m1
    add        tmpq, 2*4
    lea        dstq, [dstq+dsq*2]
    sub        hd, 2
    jg .w4_loop
    RET
.w8:
    mova       m3, [maskq+16]
    ; 8 mask blend is provided for 16 pixels
.w8_loop:
    movq       m1, [dstq+dsq*0] ; a
    movhps     m1, [dstq+dsq*1]
    mova       m2, [tmpq]; b
    BLEND_64M  m1, m2, m3, m3
    movq       [dstq+dsq*0], m0
    movhps     [dstq+dsq*1], m0
    add        tmpq, 16
    lea        dstq, [dstq+dsq*2]
    sub        hd, 2
    jg .w8_loop
    RET
.w16:
    ; 16 mask blend is provided for 32 pixels
    mova       m3, [maskq+32] ; obmc_masks_16[0] (64-m[0])
    mova       m4, [maskq+48] ; obmc_masks_16[1] (64-m[1])
.w16_loop:
    mova       m1, [dstq] ; a
    mova       m2, [tmpq] ; b
    BLEND_64M  m1, m2, m3, m4
    mova       [dstq], m0
    add        tmpq, 16
    add        dstq, dsq
    dec        hd
    jg .w16_loop
    RET
.w32:
%if WIN64
    mova       [rsp+8], xmm6
%endif
    mova       m3, [maskq+64] ; obmc_masks_32[0] (64-m[0])
    mova       m4, [maskq+80] ; obmc_masks_32[1] (64-m[1])
    mova       m6, [maskq+96] ; obmc_masks_32[2] (64-m[2])
    ; 32 mask blend is provided for 64 pixels
.w32_loop:
    mova       m1, [dstq+16*0] ; a
    mova       m2, [tmpq+16*0] ; b
    BLEND_64M  m1, m2, m3, m4
    movq       m1, [dstq+16*1] ; a
    punpcklbw  m1, [tmpq+16*1] ; b
    pmaddubsw  m1, m6
    pmulhrsw   m1, m5
    packuswb   m1, m1
    mova       [dstq+16*0], m0
    movq       [dstq+16*1], m1
    add        tmpq, 32
    add        dstq, dsq
    dec        hd
    jg .w32_loop
%if WIN64
    mova       xmm6, [rsp+8]
%endif
    RET

cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mask
%define base t0-blend_h_ssse3_table
%if ARCH_X86_32
    ; We need to keep the PIC pointer for w4, reload wd from stack instead
    DECLARE_REG_TMP 6
%else
    DECLARE_REG_TMP 5
    mov        r6d, wd
%endif
    LEA        t0, blend_h_ssse3_table
    tzcnt      wd, wm
    mov        hd, hm
    movsxd     wq, dword [t0+wq*4]
    mova       m5, [base+pw_512]
    add        wq, t0
    lea        maskq, [base+obmc_masks+hq*2]
    lea        hd, [hq*3]
    shr        hd, 2 ; h * 3/4
    lea        maskq, [maskq+hq*2]
    neg        hq
    jmp        wq
.w2:
    movd       m0, [dstq+dsq*0]
    pinsrw     m0, [dstq+dsq*1], 1
    movd       m2, [maskq+hq*2]
    movd       m1, [tmpq]
    punpcklwd  m2, m2
    punpcklbw  m0, m1
    pmaddubsw  m0, m2
    pmulhrsw   m0, m5
    packuswb   m0, m0
    movd       r3d, m0
    mov        [dstq+dsq*0], r3w
    shr        r3d, 16
    mov        [dstq+dsq*1], r3w
    lea        dstq, [dstq+dsq*2]
    add        tmpq, 2*2
    add        hq, 2
    jl .w2
    RET
.w4:
%if ARCH_X86_32
    mova       m3, [base+blend_shuf]
%else
    mova       m3, [blend_shuf]
%endif
.w4_loop:
    movd       m0, [dstq+dsq*0]
    movd       m2, [dstq+dsq*1]
    punpckldq  m0, m2 ; a
    movq       m1, [tmpq] ; b
    movq       m2, [maskq+hq*2] ; m
    pshufb     m2, m3
    punpcklbw  m0, m1
    pmaddubsw  m0, m2
    pmulhrsw   m0, m5
    packuswb   m0, m0
    movd       [dstq+dsq*0], m0
    psrlq      m0, 32
    movd       [dstq+dsq*1], m0
    lea        dstq, [dstq+dsq*2]
    add        tmpq, 4*2
    add        hq, 2
    jl .w4_loop
    RET
.w8:
    movd       m4, [maskq+hq*2]
    punpcklwd  m4, m4
    pshufd     m3, m4, q0000
    pshufd     m4, m4, q1111
    movq       m1, [dstq+dsq*0] ; a
    movhps     m1, [dstq+dsq*1]
    mova       m2, [tmpq]
    BLEND_64M  m1, m2, m3, m4
    movq       [dstq+dsq*0], m0
    movhps     [dstq+dsq*1], m0
    lea        dstq, [dstq+dsq*2]
    add        tmpq, 8*2
    add        hq, 2
    jl .w8
    RET
; w16/w32/w64/w128
.w16:
%if ARCH_X86_32
    mov        r6d, wm
%endif
    sub        dsq, r6
.w16_loop0:
    movd       m3, [maskq+hq*2]
    pshuflw    m3, m3, q0000
    punpcklqdq m3, m3
    mov        wd, r6d
.w16_loop:
    mova       m1, [dstq] ; a
    mova       m2, [tmpq] ; b
    BLEND_64M  m1, m2, m3, m3
    mova       [dstq], m0
    add        dstq, 16
    add        tmpq, 16
    sub        wd, 16
    jg .w16_loop
    add        dstq, dsq
    inc        hq
    jl .w16_loop0
    RET

; emu_edge args:
; const intptr_t bw, const intptr_t bh, const intptr_t iw, const intptr_t ih,
; const intptr_t x, const intptr_t y, pixel *dst, const ptrdiff_t dst_stride,
; const pixel *ref, const ptrdiff_t ref_stride
;
; bw, bh total filled size
; iw, ih, copied block -> fill bottom, right
; x, y, offset in bw/bh -> fill top, left
cglobal emu_edge_8bpc, 10, 13, 2, bw, bh, iw, ih, x, \
                                  y, dst, dstride, src, sstride, \
                                  bottomext, rightext, blk
    ; we assume that the buffer (stride) is larger than width, so we can
    ; safely overwrite by a few bytes
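    ; the block is filled in three phases: the center rows are copied
    ; (with left/right edge replication as needed) by one of the v_loop
    ; variants, then the bottom and top rows are replicated from the
    ; nearest filled row.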
    pxor       m1, m1

%if ARCH_X86_64
 %define reg_zero       r12q
 %define reg_tmp        r10
 %define reg_src        srcq
 %define reg_bottomext  bottomextq
 %define reg_rightext   rightextq
 %define reg_blkm       r9m
%else
 %define reg_zero       r6
 %define reg_tmp        r0
 %define reg_src        r1
 %define reg_bottomext  r0
 %define reg_rightext   r1
 %define reg_blkm       r2m
%endif
    ;
    ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
    xor        reg_zero, reg_zero
    lea        reg_tmp, [ihq-1]
    cmp        yq, ihq
    cmovs      reg_tmp, yq
    test       yq, yq
    cmovs      reg_tmp, reg_zero
%if ARCH_X86_64
    imul       reg_tmp, sstrideq
    add        srcq, reg_tmp
%else
    imul       reg_tmp, sstridem
    mov        reg_src, srcm
    add        reg_src, reg_tmp
%endif
    ;
    ; ref += iclip(x, 0, iw - 1)
    lea        reg_tmp, [iwq-1]
    cmp        xq, iwq
    cmovs      reg_tmp, xq
    test       xq, xq
    cmovs      reg_tmp, reg_zero
    add        reg_src, reg_tmp
%if ARCH_X86_32
    mov        srcm, reg_src
%endif
    ;
    ; bottom_ext = iclip(y + bh - ih, 0, bh - 1)
%if ARCH_X86_32
    mov        r1, r1m ; restore bh
%endif
    lea        reg_bottomext, [yq+bhq]
    sub        reg_bottomext, ihq
    lea        r3, [bhq-1]
    cmovs      reg_bottomext, reg_zero
    ;

    DEFINE_ARGS bw, bh, iw, ih, x, \
                topext, dst, dstride, src, sstride, \
                bottomext, rightext, blk

    ; top_ext = iclip(-y, 0, bh - 1)
    neg        topextq
    cmovs      topextq, reg_zero
    cmp        reg_bottomext, bhq
    cmovns     reg_bottomext, r3
    cmp        topextq, bhq
    cmovg      topextq, r3
 %if ARCH_X86_32
    mov        r4m, reg_bottomext
    ;
    ; right_ext = iclip(x + bw - iw, 0, bw - 1)
    mov        r0, r0m ; restore bw
 %endif
    lea        reg_rightext, [xq+bwq]
    sub        reg_rightext, iwq
    lea        r2, [bwq-1]
    cmovs      reg_rightext, reg_zero

    DEFINE_ARGS bw, bh, iw, ih, leftext, \
                topext, dst, dstride, src, sstride, \
                bottomext, rightext, blk

    ; left_ext = iclip(-x, 0, bw - 1)
    neg        leftextq
    cmovs      leftextq, reg_zero
    cmp        reg_rightext, bwq
    cmovns     reg_rightext, r2
 %if ARCH_X86_32
    mov        r3m, r1
 %endif
    cmp        leftextq, bwq
    cmovns     leftextq, r2

%undef reg_zero
%undef reg_tmp
%undef reg_src
%undef reg_bottomext
%undef reg_rightext

    DEFINE_ARGS bw, centerh, centerw, dummy, leftext, \
                topext, dst, dstride, src, sstride, \
                bottomext, rightext, blk

    ; center_h = bh - top_ext - bottom_ext
%if ARCH_X86_64
    lea        r3, [bottomextq+topextq]
    sub        centerhq, r3
%else
    mov        r1, centerhm ; restore r1
    sub        centerhq, topextq
    sub        centerhq, r4m
    mov        r1m, centerhq
%endif
    ;
    ; blk += top_ext * PXSTRIDE(dst_stride)
    mov        r2, topextq
%if ARCH_X86_64
    imul       r2, dstrideq
%else
    mov        r6, r6m ; restore dstq
    imul       r2, dstridem
%endif
    add        dstq, r2
    mov        reg_blkm, dstq ; save pointer for ext
    ;
    ; center_w = bw - left_ext - right_ext
    mov        centerwq, bwq
%if ARCH_X86_64
    lea        r3, [rightextq+leftextq]
    sub        centerwq, r3
%else
    sub        centerwq, r3m
    sub        centerwq, leftextq
%endif
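; the center copy is instantiated four times from the v_loop macro below:
; v_loop 0,0 (no edge extension), 1,0 (left only), 1,1 (left+right) and
; 0,1 (right only); the tests after the macro select the variant.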
; vloop Macro
%macro v_loop 3 ; need_left_ext, need_right_ext, suffix
 %if ARCH_X86_64
  %define reg_tmp r12
 %else
  %define reg_tmp r0
 %endif
.v_loop_%3:
 %if ARCH_X86_32
    mov        r0, r0m
    mov        r1, r1m
 %endif
%if %1
    ; left extension
 %if ARCH_X86_64
    movd       m0, [srcq]
 %else
    mov        r3, srcm
    movd       m0, [r3]
 %endif
    pshufb     m0, m1
    xor        r3, r3
.left_loop_%3:
    mova       [dstq+r3], m0
    add        r3, mmsize
    cmp        r3, leftextq
    jl .left_loop_%3
    ; body
    lea        reg_tmp, [dstq+leftextq]
%endif
    xor        r3, r3
.body_loop_%3:
 %if ARCH_X86_64
    movu       m0, [srcq+r3]
 %else
    mov        r1, srcm
    movu       m0, [r1+r3]
 %endif
%if %1
    movu       [reg_tmp+r3], m0
%else
    movu       [dstq+r3], m0
%endif
    add        r3, mmsize
    cmp        r3, centerwq
    jl .body_loop_%3
%if %2
    ; right extension
%if %1
    add        reg_tmp, centerwq
%else
    lea        reg_tmp, [dstq+centerwq]
%endif
 %if ARCH_X86_64
    movd       m0, [srcq+centerwq-1]
 %else
    mov        r3, srcm
    movd       m0, [r3+centerwq-1]
 %endif
    pshufb     m0, m1
    xor        r3, r3
.right_loop_%3:
    movu       [reg_tmp+r3], m0
    add        r3, mmsize
 %if ARCH_X86_64
    cmp        r3, rightextq
 %else
    cmp        r3, r3m
 %endif
    jl .right_loop_%3
%endif
 %if ARCH_X86_64
    add        dstq, dstrideq
    add        srcq, sstrideq
    dec        centerhq
    jg .v_loop_%3
 %else
    add        dstq, dstridem
    mov        r0, sstridem
    add        srcm, r0
    sub        dword centerhm, 1
    jg .v_loop_%3
    mov        r0, r0m ; restore r0
 %endif
%endmacro ; vloop MACRO

    test       leftextq, leftextq
    jnz .need_left_ext
 %if ARCH_X86_64
    test       rightextq, rightextq
    jnz .need_right_ext
 %else
    cmp        leftextq, r3m ; leftextq == 0
    jne .need_right_ext
 %endif
    v_loop     0, 0, 0
    jmp .body_done

    ;left right extensions
.need_left_ext:
 %if ARCH_X86_64
    test       rightextq, rightextq
 %else
    mov        r3, r3m
    test       r3, r3
 %endif
    jnz .need_left_right_ext
    v_loop     1, 0, 1
    jmp .body_done

.need_left_right_ext:
    v_loop     1, 1, 2
    jmp .body_done

.need_right_ext:
    v_loop     0, 1, 3

.body_done:
; r0 ; bw
; r1 ;; x loop
; r4 ;; y loop
; r5 ; topextq
; r6 ;dstq
; r7 ;dstrideq
; r8 ; srcq
%if ARCH_X86_64
 %define reg_dstride dstrideq
%else
 %define reg_dstride r2
%endif
    ;
    ; bottom edge extension
 %if ARCH_X86_64
    test       bottomextq, bottomextq
    jz .top
 %else
    xor        r1, r1
    cmp        r1, r4m
    je .top
 %endif
    ;
 %if ARCH_X86_64
    mov        srcq, dstq
    sub        srcq, dstrideq
    xor        r1, r1
 %else
    mov        r3, dstq
    mov        reg_dstride, dstridem
    sub        r3, reg_dstride
    mov        srcm, r3
 %endif
    ;
.bottom_x_loop:
 %if ARCH_X86_64
    mova       m0, [srcq+r1]
    lea        r3, [dstq+r1]
    mov        r4, bottomextq
 %else
    mov        r3, srcm
    mova       m0, [r3+r1]
    lea        r3, [dstq+r1]
    mov        r4, r4m
 %endif
    ;
.bottom_y_loop:
    mova       [r3], m0
    add        r3, reg_dstride
    dec        r4
    jg .bottom_y_loop
    add        r1, mmsize
    cmp        r1, bwq
    jl .bottom_x_loop

.top:
    ; top edge extension
    test       topextq, topextq
    jz .end
%if ARCH_X86_64
    mov        srcq, reg_blkm
%else
    mov        r3, reg_blkm
    mov        reg_dstride, dstridem
%endif
    mov        dstq, dstm
    xor        r1, r1
    ;
.top_x_loop:
%if ARCH_X86_64
    mova       m0, [srcq+r1]
%else
    mov        r3, reg_blkm
    mova       m0, [r3+r1]
%endif
    lea        r3, [dstq+r1]
    mov        r4, topextq
    ;
.top_y_loop:
    mova       [r3], m0
    add        r3, reg_dstride
    dec        r4
    jg .top_y_loop
    add        r1, mmsize
    cmp        r1, bwq
    jl .top_x_loop

.end:
    RET

%undef reg_dstride
%undef reg_blkm
%undef reg_tmp
cextern resize_filter

%macro SCRATCH 3
%if ARCH_X86_32
    mova       [rsp+%3*mmsize], m%1
 %define m%2 [rsp+%3*mmsize]
%else
    SWAP       %1, %2
%endif
%endmacro

%if ARCH_X86_64
cglobal resize_8bpc, 0, 12, 14, dst, dst_stride, src, src_stride, \
                                dst_w, h, src_w, dx, mx0
%elif STACK_ALIGNMENT >= 16
cglobal resize_8bpc, 0, 7, 8, 3 * 16, dst, dst_stride, src, src_stride, \
                                      dst_w, h, src_w, dx, mx0
%else
cglobal resize_8bpc, 0, 6, 8, 3 * 16, dst, dst_stride, src, src_stride, \
                                      dst_w, h, src_w, dx, mx0
%endif
    movifnidn  dstq, dstmp
    movifnidn  srcq, srcmp
%if STACK_ALIGNMENT >= 16
    movifnidn  dst_wd, dst_wm
%endif
%if ARCH_X86_64
    movifnidn  hd, hm
%endif
    sub        dword mx0m, 4<<14
    sub        dword src_wm, 8
    movd       m7, dxm
    movd       m6, mx0m
    movd       m5, src_wm
    pshufd     m7, m7, q0000
    pshufd     m6, m6, q0000
    pshufd     m5, m5, q0000

%if ARCH_X86_64
    DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x
    LEA        r7, $$
%define base r7-$$
%else
    DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, x
%define hd dword r5m
%if STACK_ALIGNMENT >= 16
    LEA        r6, $$
%define base r6-$$
%else
    LEA        r4, $$
%define base r4-$$
%endif
%endif

%if ARCH_X86_64
    mova       m10, [base+pw_m256]
    mova       m9, [base+pd_63]
    mova       m8, [base+pb_8x0_8x8]
%else
%define m10 [base+pw_m256]
%define m9  [base+pd_63]
%define m8  [base+pb_8x0_8x8]
%endif
    pmaddwd    m4, m7, [base+rescale_mul] ; dx*[0,1,2,3]
    pslld      m7, 2 ; dx*4
    pslld      m5, 14
    paddd      m6, m4 ; mx+[0..3]*dx
    SCRATCH    7, 13, 0
    SCRATCH    6, 12, 1
    SCRATCH    5, 11, 2

    ; m10 = pmulhrsw constant for x=(x+64)>>7
    ; m12 = mx+[0..3]*dx, m13 = dx*4, m11 = src_w, m9 = 0x3f, m8=0,8

.loop_y:
    xor        xd, xd
    mova       m0, m12 ; per-line working version of mx

.loop_x:
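    ; x positions are tracked in 14-bit fixed point (m0): for each of the
    ; four output pixels, compute the clipped integer src_x (psrad 14),
    ; the 6-bit filter index ((mx >> 8) & 0x3f), and, where src_x had to
    ; be clipped, a pshufb offset used to emulate the edge pixels.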
    pxor       m1, m1
    pcmpgtd    m1, m0
    pandn      m1, m0
    psrad      m2, m0, 8 ; filter offset (unmasked)
    pcmpgtd    m3, m11, m1
    pand       m1, m3
    pandn      m3, m11
    por        m1, m3
    psubd      m3, m0, m1 ; pshufb offset
    psrad      m1, 14 ; clipped src_x offset
    psrad      m3, 14 ; pshufb edge_emu offset
    pand       m2, m9 ; filter offset (masked)

    ; load source pixels
%if ARCH_X86_64
    movd       r8d, m1
    pshuflw    m1, m1, q3232
    movd       r9d, m1
    punpckhqdq m1, m1
    movd       r10d, m1
    psrlq      m1, 32
    movd       r11d, m1
    movq       m4, [srcq+r8]
    movq       m5, [srcq+r10]
    movhps     m4, [srcq+r9]
    movhps     m5, [srcq+r11]
%else
    movd       r3d, m1
    pshufd     m1, m1, q3312
    movd       r1d, m1
    pshuflw    m1, m1, q3232
    movq       m4, [srcq+r3]
    movq       m5, [srcq+r1]
    movd       r3d, m1
    punpckhqdq m1, m1
    movd       r1d, m1
    movhps     m4, [srcq+r3]
    movhps     m5, [srcq+r1]
%endif

    ; if no emulation is required, we don't need to shuffle or emulate edges
    ; this also saves 2 quasi-vpgatherdqs
    pxor       m6, m6
    pcmpeqb    m6, m3
%if ARCH_X86_64
    pmovmskb   r8d, m6
    cmp        r8d, 0xffff
%else
    pmovmskb   r3d, m6
    cmp        r3d, 0xffff
%endif
    je .filter

%if ARCH_X86_64
    movd       r8d, m3
    pshuflw    m3, m3, q3232
    movd       r9d, m3
    punpckhqdq m3, m3
    movd       r10d, m3
    psrlq      m3, 32
    movd       r11d, m3
    movsxd     r8, r8d
    movsxd     r9, r9d
    movsxd     r10, r10d
    movsxd     r11, r11d
    movq       m6, [base+resize_shuf+4+r8]
    movq       m7, [base+resize_shuf+4+r10]
    movhps     m6, [base+resize_shuf+4+r9]
    movhps     m7, [base+resize_shuf+4+r11]
%else
    movd       r3d, m3
    pshufd     m3, m3, q3312
    movd       r1d, m3
    pshuflw    m3, m3, q3232
    movq       m6, [base+resize_shuf+4+r3]
    movq       m7, [base+resize_shuf+4+r1]
    movd       r3d, m3
    punpckhqdq m3, m3
    movd       r1d, m3
    movhps     m6, [base+resize_shuf+4+r3]
    movhps     m7, [base+resize_shuf+4+r1]
%endif

    paddb      m6, m8
    paddb      m7, m8
    pshufb     m4, m6
    pshufb     m5, m7

.filter:
%if ARCH_X86_64
    movd       r8d, m2
    pshuflw    m2, m2, q3232
    movd       r9d, m2
    punpckhqdq m2, m2
    movd       r10d, m2
    psrlq      m2, 32
    movd       r11d, m2
    movq       m6, [base+resize_filter+r8*8]
    movq       m7, [base+resize_filter+r10*8]
    movhps     m6, [base+resize_filter+r9*8]
    movhps     m7, [base+resize_filter+r11*8]
%else
    movd       r3d, m2
    pshufd     m2, m2, q3312
    movd       r1d, m2
    pshuflw    m2, m2, q3232
    movq       m6, [base+resize_filter+r3*8]
    movq       m7, [base+resize_filter+r1*8]
    movd       r3d, m2
    punpckhqdq m2, m2
    movd       r1d, m2
    movhps     m6, [base+resize_filter+r3*8]
    movhps     m7, [base+resize_filter+r1*8]
%endif

    pmaddubsw  m4, m6
    pmaddubsw  m5, m7
    phaddw     m4, m5
    phaddsw    m4, m4
    pmulhrsw   m4, m10 ; x=(x+64)>>7
    packuswb   m4, m4
    movd       [dstq+xq], m4

    paddd      m0, m13
    add        xd, 4
%if STACK_ALIGNMENT >= 16
    cmp        xd, dst_wd
%else
    cmp        xd, dst_wm
%endif
    jl .loop_x

    add        dstq, dst_stridemp
    add        srcq, src_stridemp
    dec        hd
    jg .loop_y
    RET

INIT_XMM ssse3
PREP_BILIN
PREP_8TAP
WARP_AFFINE_8X8
WARP_AFFINE_8X8T

INIT_XMM sse4
WARP_AFFINE_8X8
WARP_AFFINE_8X8T

INIT_XMM sse2
PREP_BILIN
PREP_8TAP
WARP_AFFINE_8X8
WARP_AFFINE_8X8T