;*****************************************************************************
;* x86-optimized functions for yadif filter
;* Copyright (C) 2020 Vivia Nikolaidou <vivia.nikolaidou@ltnglobal.com>
;*
;* Based on libav's vf_yadif.asm file
;* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
;* Copyright (c) 2013 Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "x86inc.asm"

SECTION_RODATA

; 16 bytes of value 1
pb_1: times 16 db 1
; 8 words of value 1
pw_1: times 8 dw 1

SECTION .text

%macro ABS1 2 ; a, tmp
%if cpuflag(ssse3)
    pabsw     %1, %1
%elif cpuflag(mmxext)
    pxor      %2, %2
    psubw     %2, %1
    pmaxsw    %1, %2
%else
    pxor      %2, %2
    pcmpgtw   %2, %1
    pxor      %1, %2
    psubw     %1, %2
%endif
%endmacro

%macro CHECK 2
; %1 = -1+j, %2 = -1-j
    ; m2 = t0[x-1+j]
    movu      m2, [tzeroq+%1]
    ; m3 = b0[x-1-j]
    movu      m3, [bzeroq+%2]
    ; m4 = t0[x-1+j]
    mova      m4, m2
    ; m5 = t0[x-1+j]
    mova      m5, m2
    ; m4 = xor(t0[x-1+j], b0[x-1-j])
    pxor      m4, m3
    ; m5 = average, rounded up
    pavgb     m5, m3
    ; m4 = the xor's low bit, i.e. the rounding error of the average
    pand      m4, [pb_1]
    ; m5 = rounded-down average of the whole thing
    psubusb   m5, m4
    ; shift right by 1 byte, so byte 0 holds avg(t0[x+j], b0[x-j]),
    ; to prepare for spatial_pred
    psrldq    m5, 1
    ; m7 = 0
    ; interleave low-order bytes with 0
    ; so one pixel doesn't spill into the next one
    punpcklbw m5, m7
    ; m4 = t0[x-1+j] (reset)
    mova      m4, m2
    ; m2 = t0[x-1+j] - b0[x-1-j], unsigned saturated
    psubusb   m2, m3
    ; m3 = -m2, unsigned saturated
    psubusb   m3, m4
    ; m2 = FFABS(t0[x-1+j] - b0[x-1-j])
    pmaxub    m2, m3
    ; m3 = FFABS(t0[x-1+j] - b0[x-1-j])
    mova      m3, m2
    ; m4 = FFABS(t0[x-1+j] - b0[x-1-j])
    mova      m4, m2
    ; m3 = FFABS(t0[x+j] - b0[x-j])
    psrldq    m3, 1
    ; m4 = FFABS(t0[x+1+j] - b0[x+1-j])
    psrldq    m4, 2
    ; prevent pixel spilling for all of them
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7
    paddw     m2, m3
    ; m2 = score
    paddw     m2, m4
%endmacro

%macro CHECK1 0
; m0 was spatial_score
; m1 was spatial_pred
    mova      m3, m0
    ; compare for greater-than:
    ; each word becomes 0xFFFF or 0x0000
    pcmpgtw   m3, m2
    ; if (score < spatial_score) spatial_score = score;
    pminsw    m0, m2
    ; m6 = the mask
    mova      m6, m3
    ; m5 = the candidate pred, zeroed where the old pred is kept
    pand      m5, m3
    ; and-not: m3 = the old pred, zeroed where the new pred wins
    pandn     m3, m1
    ; merge the two with an OR
    por       m3, m5
    ; and put it in spatial_pred
    mova      m1, m3
%endmacro
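
; For orientation, a sketch of the C logic that CHECK/CHECK1/CHECK2 implement
; together (paraphrased from the yadif reference filter; variable names are
; illustrative, not the exact source):
;
;   score = FFABS(t0[x-1+j] - b0[x-1-j])
;         + FFABS(t0[x+j]   - b0[x-j])
;         + FFABS(t0[x+1+j] - b0[x+1-j]);
;   if (score < spatial_score) {
;       spatial_score = score;
;       spatial_pred  = (t0[x+j] + b0[x-j]) >> 1;
;   } else
;       break;  /* larger |j| is not tried once a check fails */
;
; CHECK computes the score, CHECK1 does the first conditional update (j = 1),
; and CHECK2 does the second one (j = 2) while emulating the break, all
; branch-free.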
%macro CHECK2 0
; m6 was the mask from CHECK1 (we don't change it)
    ; m6 + 1 = 0 where the previous check improved the score, 1 elsewhere
    paddw     m6, [pw_1]
    ; shift each word left by 14 bits: m6 becomes 0 or 0x4000
    ; (reuses the mask from CHECK1 instead of recomputing it)
    psllw     m6, 14
    ; add the penalty to the score: where the previous check did not improve
    ; the score, the new score saturates high and can no longer win
    paddsw    m2, m6
    ; same as CHECK1
    mova      m3, m0
    pcmpgtw   m3, m2
    pminsw    m0, m2
    pand      m5, m3
    pandn     m3, m1
    por       m3, m5
    mova      m1, m3
%endmacro

; load 8 bytes and zero-extend them to 8 words (m7 must be 0)
%macro LOAD 2
    movh      %1, %2
    punpcklbw %1, m7
%endmacro

%macro FILTER_HEAD 0
    ; m7 = 0
    pxor      m7, m7
    ; m0 = c
    LOAD      m0, [tzeroq]
    ; m1 = e
    LOAD      m1, [bzeroq]
    ; m3 = mp
    LOAD      m3, [mpq]
    ; m2 = m1 (the C-side name, not register m1)
    LOAD      m2, [moneq]
    ; m4 = mp
    mova      m4, m3
    ; m3 = m1 + mp
    paddw     m3, m2
    ; m3 = d
    psraw     m3, 1
    ; rsp + 0 = d
    mova      [rsp+ 0], m3
    ; rsp + 16 = e
    mova      [rsp+16], m1
    ; m2 = m1 - mp
    psubw     m2, m4
    ; m2 = temporal_diff0 (m4 is a temporary)
    ABS1      m2, m4
    ; m3 = t2
    LOAD      m3, [ttwoq]
    ; m4 = b2
    LOAD      m4, [btwoq]
    ; m3 = t2 - c
    psubw     m3, m0
    ; m4 = b2 - e
    psubw     m4, m1
    ; m3 = FFABS(t2 - c)
    ABS1      m3, m5
    ; m4 = FFABS(b2 - e)
    ABS1      m4, m5
    paddw     m3, m4
    ; m2 = temporal_diff0 >> 1
    psrlw     m2, 1
    ; m3 = temporal_diff1
    psrlw     m3, 1
    ; m2 = FFMAX(temporal_diff0 >> 1, temporal_diff1)
    pmaxsw    m2, m3
    ; m3 = tp2
    LOAD      m3, [tptwoq]
    ; m4 = bp2
    LOAD      m4, [bptwoq]
    psubw     m3, m0
    psubw     m4, m1
    ABS1      m3, m5
    ABS1      m4, m5
    paddw     m3, m4
    ; m3 = temporal_diff2
    psrlw     m3, 1
    ; m2 = diff (for real)
    pmaxsw    m2, m3
    ; rsp + 32 = diff
    mova      [rsp+32], m2

    ; m1 = e + c
    paddw     m1, m0
    ; m0 = 2c
    paddw     m0, m0
    ; m0 = 2c - (e + c) = c - e
    psubw     m0, m1
    ; m1 = spatial_pred
    psrlw     m1, 1
    ; m0 = FFABS(c - e)
    ABS1      m0, m2

    ; m2 = t0[x-1] (unaligned load starting one pixel before x)
    movu      m2, [tzeroq-1]
    ; m3 = b0[x-1]
    movu      m3, [bzeroq-1]
    ; m4 = t0[x-1]
    mova      m4, m2
    ; m2 = t0[x-1] - b0[x-1], unsigned saturated
    psubusb   m2, m3
    ; m3 = m3 - m4 = b0[x-1] - t0[x-1] = -m2, unsigned saturated
    psubusb   m3, m4
    ; m2 = max(m2, -m2) = FFABS(t0[x-1] - b0[x-1])
    pmaxub    m2, m3
%if mmsize == 16
    ; m3 = m2 shifted right by 2 bytes
    ; pixel jump: go from x-1 to x+1
    mova      m3, m2
    psrldq    m3, 2
%else
    pshufw    m3, m2, q0021
%endif
    ; m7 = 0
    ; unpack and interleave low-order bytes
    ; to prevent pixel spilling when adding
    punpcklbw m2, m7
    punpcklbw m3, m7
    paddw     m0, m2
    paddw     m0, m3
    ; m0 = spatial_score
    psubw     m0, [pw_1]

    CHECK -2, 0
    CHECK1
    CHECK -3, 1
    CHECK2
    CHECK 0, -2
    CHECK1
    CHECK 1, -3
    CHECK2
    ; now m0 = spatial_score, m1 = spatial_pred

    ; m6 = diff
    mova      m6, [rsp+32]
%endmacro

%macro FILTER_TAIL 0
    ; m2 = d
    mova      m2, [rsp]
    ; m3 = d
    mova      m3, m2
    ; m2 = d - diff
    psubw     m2, m6
    ; m3 = d + diff
    paddw     m3, m6
    ; m1 = FFMAX(spatial_pred, d - diff)
    pmaxsw    m1, m2
    ; m1 = FFMIN(d + diff, FFMAX(spatial_pred, d - diff))
    ;    = clip(spatial_pred, d - diff, d + diff)
    pminsw    m1, m3
    ; pack the 8 signed words back to unsigned bytes with saturation
    ; (duplicated into both halves of the register)
    packuswb  m1, m1

    ; dst = spatial_pred
    movh      [dstq], m1
    ; advance all pointers by half the register size:
    ; each iteration consumes and produces mmsize/2 pixels
    add       dstq, mmsize/2
    add       tzeroq, mmsize/2
    add       bzeroq, mmsize/2
    add       moneq, mmsize/2
    add       mpq, mmsize/2
    add       ttwoq, mmsize/2
    add       btwoq, mmsize/2
    add       tptwoq, mmsize/2
    add       bptwoq, mmsize/2
    add       ttoneq, mmsize/2
    add       ttpq, mmsize/2
    add       bboneq, mmsize/2
    add       bbpq, mmsize/2
%endmacro

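; Mode 0 additionally clamps diff against two more line pairs before the
; tail.  A sketch of the corresponding C (paraphrased from the yadif
; reference filter; variable names follow the register comments below):
;
;   int b   = (tt1 + ttp) >> 1;
;   int f   = (bb1 + bbp) >> 1;
;   int max = FFMAX3(d - e, d - c, FFMIN(b - c, f - e));
;   int min = FFMIN3(d - e, d - c, FFMAX(b - c, f - e));
;   diff    = FFMAX3(diff, min, -max);
;
; The pmaxsw/pminsw chain below evaluates this entirely branch-free.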
%macro FILTER_MODE0 0
.loop0:
    FILTER_HEAD
    ; m2 = tt1
    LOAD      m2, [ttoneq]
    ; m4 = ttp
    LOAD      m4, [ttpq]
    ; m3 = bb1
    LOAD      m3, [bboneq]
    ; m5 = bbp
    LOAD      m5, [bbpq]
    paddw     m2, m4
    paddw     m3, m5
    ; m2 = b
    psrlw     m2, 1
    ; m3 = f
    psrlw     m3, 1
    ; m4 = c
    LOAD      m4, [tzeroq]
    ; m5 = d
    mova      m5, [rsp]
    ; m7 = e
    mova      m7, [rsp+16]
    ; m2 = b - c
    psubw     m2, m4
    ; m3 = f - e
    psubw     m3, m7
    ; m0 = d
    mova      m0, m5
    ; m5 = d - c
    psubw     m5, m4
    ; m0 = d - e
    psubw     m0, m7
    ; m4 = b - c
    mova      m4, m2
    ; m2 = FFMIN(b - c, f - e)
    pminsw    m2, m3
    ; m3 = FFMAX(f - e, b - c)
    pmaxsw    m3, m4
    ; m2 = FFMAX(d - c, FFMIN(b - c, f - e))
    pmaxsw    m2, m5
    ; m3 = FFMIN(d - c, FFMAX(f - e, b - c))
    pminsw    m3, m5
    ; m2 = max = FFMAX3(d - e, d - c, FFMIN(b - c, f - e))
    pmaxsw    m2, m0
    ; m3 = min = FFMIN3(d - e, d - c, FFMAX(f - e, b - c))
    pminsw    m3, m0
    ; m4 = 0
    pxor      m4, m4
    ; m6 = FFMAX(diff, min)
    pmaxsw    m6, m3
    ; m4 = -max
    psubw     m4, m2
    ; m6 = diff = FFMAX3(diff, min, -max)
    pmaxsw    m6, m4

    FILTER_TAIL
    ; r13m = w
    sub       DWORD r13m, mmsize/2
    jg .loop0
%endmacro

%macro FILTER_MODE2 0
.loop2:
    FILTER_HEAD
    FILTER_TAIL
    ; r13m = w
    sub       DWORD r13m, mmsize/2
    jg .loop2
%endmacro

%macro YADIF_ADD3 0
    ; start 3 pixels later, so CHECK can safely read up to 3 bytes
    ; to the left; the caller filters the edge pixels separately
    add       dstq, 3
    add       tzeroq, 3
    add       bzeroq, 3
    add       moneq, 3
    add       mpq, 3
    add       ttwoq, 3
    add       btwoq, 3
    add       tptwoq, 3
    add       bptwoq, 3
    add       ttoneq, 3
    add       ttpq, 3
    add       bboneq, 3
    add       bbpq, 3
%endmacro

; cglobal foo, 2,3,7,0x40, dst, src, tmp
; declares a function (foo) that automatically loads two arguments (dst and
; src) into registers, uses one additional register (tmp) plus 7 vector
; registers (m0-m6) and allocates 0x40 bytes of stack space.
%macro YADIF_MODE0 0
cglobal yadif_filter_line_mode0, 13, 14, 8, 80, dst, tzero, bzero, mone, mp, \
                                                ttwo, btwo, tptwo, bptwo, \
                                                ttone, ttp, bbone, bbp, w

    YADIF_ADD3
    FILTER_MODE0
    RET
%endmacro

%macro YADIF_MODE2 0
cglobal yadif_filter_line_mode2, 13, 14, 8, 80, dst, tzero, bzero, mone, mp, \
                                                ttwo, btwo, tptwo, bptwo, \
                                                ttone, ttp, bbone, bbp, w

    YADIF_ADD3
    FILTER_MODE2
    RET
%endmacro

; declares two functions for ssse3, and two for sse2
INIT_XMM ssse3
YADIF_MODE0
YADIF_MODE2
INIT_XMM sse2
YADIF_MODE0
YADIF_MODE2
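
; Note: INIT_XMM sets mmsize to 16 and suffixes each cglobal name with the
; given cpu flag, so this file emits yadif_filter_line_mode0_ssse3,
; yadif_filter_line_mode2_ssse3, yadif_filter_line_mode0_sse2 and
; yadif_filter_line_mode2_sse2 (plus whatever private_prefix x86inc is
; configured with).  The sse2 variants differ only through ABS1, which can
; use pabsw when ssse3 is available.  A possible C-side prototype matching
; the argument list above (a sketch; the caller's actual declaration may
; differ):
;
;   void yadif_filter_line_mode0_ssse3(uint8_t *dst,
;       const uint8_t *tzero, const uint8_t *bzero, const uint8_t *mone,
;       const uint8_t *mp,    const uint8_t *ttwo,  const uint8_t *btwo,
;       const uint8_t *tptwo, const uint8_t *bptwo, const uint8_t *ttone,
;       const uint8_t *ttp,   const uint8_t *bbone, const uint8_t *bbp,
;       int w);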