;******************************************************************************
;* VP9 motion compensation SIMD optimizations
;*
;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

cextern pw_256
cextern pw_64

%macro F8_SSSE3_TAPS 8
times 16 db %1, %2
times 16 db %3, %4
times 16 db %5, %6
times 16 db %7, %8
%endmacro

%macro F8_SSE2_TAPS 8
times 8 dw %1
times 8 dw %2
times 8 dw %3
times 8 dw %4
times 8 dw %5
times 8 dw %6
times 8 dw %7
times 8 dw %8
%endmacro

%macro F8_16BPP_TAPS 8
times 8 dw %1, %2
times 8 dw %3, %4
times 8 dw %5, %6
times 8 dw %7, %8
%endmacro

%macro FILTER 1
const filters_%1 ; smooth
        F8_TAPS  -3,  -1,  32,  64,  38,   1,  -3,   0
        F8_TAPS  -2,  -2,  29,  63,  41,   2,  -3,   0
        F8_TAPS  -2,  -2,  26,  63,  43,   4,  -4,   0
        F8_TAPS  -2,  -3,  24,  62,  46,   5,  -4,   0
        F8_TAPS  -2,  -3,  21,  60,  49,   7,  -4,   0
        F8_TAPS  -1,  -4,  18,  59,  51,   9,  -4,   0
        F8_TAPS  -1,  -4,  16,  57,  53,  12,  -4,  -1
        F8_TAPS  -1,  -4,  14,  55,  55,  14,  -4,  -1
        F8_TAPS  -1,  -4,  12,  53,  57,  16,  -4,  -1
        F8_TAPS   0,  -4,   9,  51,  59,  18,  -4,  -1
        F8_TAPS   0,  -4,   7,  49,  60,  21,  -3,  -2
        F8_TAPS   0,  -4,   5,  46,  62,  24,  -3,  -2
        F8_TAPS   0,  -4,   4,  43,  63,  26,  -2,  -2
        F8_TAPS   0,  -3,   2,  41,  63,  29,  -2,  -2
        F8_TAPS   0,  -3,   1,  38,  64,  32,  -1,  -3
        ; regular
        F8_TAPS   0,   1,  -5, 126,   8,  -3,   1,   0
        F8_TAPS  -1,   3, -10, 122,  18,  -6,   2,   0
        F8_TAPS  -1,   4, -13, 118,  27,  -9,   3,  -1
        F8_TAPS  -1,   4, -16, 112,  37, -11,   4,  -1
        F8_TAPS  -1,   5, -18, 105,  48, -14,   4,  -1
        F8_TAPS  -1,   5, -19,  97,  58, -16,   5,  -1
        F8_TAPS  -1,   6, -19,  88,  68, -18,   5,  -1
        F8_TAPS  -1,   6, -19,  78,  78, -19,   6,  -1
        F8_TAPS  -1,   5, -18,  68,  88, -19,   6,  -1
        F8_TAPS  -1,   5, -16,  58,  97, -19,   5,  -1
        F8_TAPS  -1,   4, -14,  48, 105, -18,   5,  -1
        F8_TAPS  -1,   4, -11,  37, 112, -16,   4,  -1
        F8_TAPS  -1,   3,  -9,  27, 118, -13,   4,  -1
        F8_TAPS   0,   2,  -6,  18, 122, -10,   3,  -1
        F8_TAPS   0,   1,  -3,   8, 126,  -5,   1,   0
        ; sharp
        F8_TAPS  -1,   3,  -7, 127,   8,  -3,   1,   0
        F8_TAPS  -2,   5, -13, 125,  17,  -6,   3,  -1
        F8_TAPS  -3,   7, -17, 121,  27, -10,   5,  -2
        F8_TAPS  -4,   9, -20, 115,  37, -13,   6,  -2
        F8_TAPS  -4,  10, -23, 108,  48, -16,   8,  -3
        F8_TAPS  -4,  10, -24, 100,  59, -19,   9,  -3
        F8_TAPS  -4,  11, -24,  90,  70, -21,  10,  -4
        F8_TAPS  -4,  11, -23,  80,  80, -23,  11,  -4
        F8_TAPS  -4,  10, -21,  70,  90, -24,  11,  -4
        F8_TAPS  -3,   9, -19,  59, 100, -24,  10,  -4
        F8_TAPS  -3,   8, -16,  48, 108, -23,  10,  -4
        F8_TAPS  -2,   6, -13,  37, 115, -20,   9,  -4
        F8_TAPS  -2,   5, -10,  27, 121, -17,   7,  -3
        F8_TAPS  -1,   3,  -6,  17, 125, -13,   5,  -2
        F8_TAPS   0,   1,  -3,   8, 127,  -7,   3,  -1
%endmacro

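; The FILTER macro is expanded once per coefficient layout: interleaved int8
; tap pairs for the SSSE3 pmaddubsw paths, each tap broadcast as int16 words
; for the SSE2/MMXEXT pmullw paths, and int16 tap pairs for the 16 bpp code
; (see the array-shape comments on the instantiations below).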
%define F8_TAPS F8_SSSE3_TAPS
; int8_t ff_filters_ssse3[3][15][4][32]
FILTER ssse3
%define F8_TAPS F8_SSE2_TAPS
; int16_t ff_filters_sse2[3][15][8][8]
FILTER sse2
%define F8_TAPS F8_16BPP_TAPS
; int16_t ff_filters_16bpp[3][15][4][16]
FILTER 16bpp

SECTION .text

%macro filter_sse2_h_fn 1
%assign %%px mmsize/2
cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 15, dst, dstride, src, sstride, h, filtery
    pxor            m5, m5
    mova            m6, [pw_64]
    mova            m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
    mova            m8, [filteryq+ 16]
    mova            m9, [filteryq+ 32]
    mova           m10, [filteryq+ 48]
    mova           m11, [filteryq+ 64]
    mova           m12, [filteryq+ 80]
    mova           m13, [filteryq+ 96]
    mova           m14, [filteryq+112]
%endif
.loop:
    movh            m0, [srcq-3]
    movh            m1, [srcq-2]
    movh            m2, [srcq-1]
    movh            m3, [srcq+0]
    movh            m4, [srcq+1]
    punpcklbw       m0, m5
    punpcklbw       m1, m5
    punpcklbw       m2, m5
    punpcklbw       m3, m5
    punpcklbw       m4, m5
    pmullw          m0, m7
%if ARCH_X86_64 && mmsize > 8
    pmullw          m1, m8
    pmullw          m2, m9
    pmullw          m3, m10
    pmullw          m4, m11
%else
    pmullw          m1, [filteryq+ 16]
    pmullw          m2, [filteryq+ 32]
    pmullw          m3, [filteryq+ 48]
    pmullw          m4, [filteryq+ 64]
%endif
    paddw           m0, m1
    paddw           m2, m3
    paddw           m0, m4
    movh            m1, [srcq+2]
    movh            m3, [srcq+3]
    movh            m4, [srcq+4]
    add           srcq, sstrideq
    punpcklbw       m1, m5
    punpcklbw       m3, m5
    punpcklbw       m4, m5
%if ARCH_X86_64 && mmsize > 8
    pmullw          m1, m12
    pmullw          m3, m13
    pmullw          m4, m14
%else
    pmullw          m1, [filteryq+ 80]
    pmullw          m3, [filteryq+ 96]
    pmullw          m4, [filteryq+112]
%endif
    paddw           m0, m1
    paddw           m3, m4
    paddw           m0, m6
    paddw           m2, m3
    paddsw          m0, m2
    psraw           m0, 7
%ifidn %1, avg
    movh            m1, [dstq]
%endif
    packuswb        m0, m0
%ifidn %1, avg
    pavgb           m0, m1
%endif
    movh        [dstq], m0
    add           dstq, dstrideq
    dec             hd
    jg .loop
    RET
%endmacro

INIT_MMX mmxext
filter_sse2_h_fn put
filter_sse2_h_fn avg

INIT_XMM sse2
filter_sse2_h_fn put
filter_sse2_h_fn avg

%macro filter_h_fn 1
%assign %%px mmsize/2
cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 11, dst, dstride, src, sstride, h, filtery
    mova            m6, [pw_256]
    mova            m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
    mova            m8, [filteryq+32]
    mova            m9, [filteryq+64]
    mova           m10, [filteryq+96]
%endif
.loop:
    movh            m0, [srcq-3]
    movh            m1, [srcq-2]
    movh            m2, [srcq-1]
    movh            m3, [srcq+0]
    movh            m4, [srcq+1]
    movh            m5, [srcq+2]
    punpcklbw       m0, m1
    punpcklbw       m2, m3
    movh            m1, [srcq+3]
    movh            m3, [srcq+4]
    add           srcq, sstrideq
    punpcklbw       m4, m5
    punpcklbw       m1, m3
    pmaddubsw       m0, m7
%if ARCH_X86_64 && mmsize > 8
    pmaddubsw       m2, m8
    pmaddubsw       m4, m9
    pmaddubsw       m1, m10
%else
    pmaddubsw       m2, [filteryq+32]
    pmaddubsw       m4, [filteryq+64]
    pmaddubsw       m1, [filteryq+96]
%endif
    paddw           m0, m4
    paddw           m2, m1
    paddsw          m0, m2
    pmulhrsw        m0, m6
%ifidn %1, avg
    movh            m1, [dstq]
%endif
    packuswb        m0, m0
%ifidn %1, avg
    pavgb           m0, m1
%endif
    movh        [dstq], m0
    add           dstq, dstrideq
    dec             hd
    jg .loop
    RET
%endmacro

INIT_MMX ssse3
filter_h_fn put
filter_h_fn avg

INIT_XMM ssse3
filter_h_fn put
filter_h_fn avg

%if ARCH_X86_64
%macro filter_hx2_fn 1
%assign %%px mmsize
cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 14, dst, dstride, src, sstride, h, filtery
    mova           m13, [pw_256]
    mova            m8, [filteryq+ 0]
    mova            m9, [filteryq+32]
    mova           m10, [filteryq+64]
    mova           m11, [filteryq+96]
.loop:
    movu            m0, [srcq-3]
    movu            m1, [srcq-2]
    movu            m2, [srcq-1]
    movu            m3, [srcq+0]
    movu            m4, [srcq+1]
    movu            m5, [srcq+2]
    movu            m6, [srcq+3]
    movu            m7, [srcq+4]
    add           srcq, sstrideq
    SBUTTERFLY      bw, 0, 1, 12
    SBUTTERFLY      bw, 2, 3, 12
    SBUTTERFLY      bw, 4, 5, 12
    SBUTTERFLY      bw, 6, 7, 12
    pmaddubsw       m0, m8
    pmaddubsw       m1, m8
    pmaddubsw       m2, m9
    pmaddubsw       m3, m9
    pmaddubsw       m4, m10
    pmaddubsw       m5, m10
    pmaddubsw       m6, m11
    pmaddubsw       m7, m11
    paddw           m0, m4
    paddw           m1, m5
    paddw           m2, m6
    paddw           m3, m7
    paddsw          m0, m2
    paddsw          m1, m3
    pmulhrsw        m0, m13
    pmulhrsw        m1, m13
    packuswb        m0, m1
%ifidn %1, avg
    pavgb           m0, [dstq]
%endif
    mova        [dstq], m0
    add           dstq, dstrideq
    dec             hd
    jg .loop
    RET
%endmacro

INIT_XMM ssse3
filter_hx2_fn put
filter_hx2_fn avg

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
filter_hx2_fn put
filter_hx2_fn avg
%endif

%endif ; ARCH_X86_64

%macro filter_sse2_v_fn 1
%assign %%px mmsize/2
%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 6, 8, 15, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 4, 7, 15, dst, dstride, src, sstride, filtery, src4, sstride3
    mov       filteryq, r5mp
%define hd r4mp
%endif
    pxor            m5, m5
    mova            m6, [pw_64]
    lea      sstride3q, [sstrideq*3]
    lea          src4q, [srcq+sstrideq]
    sub           srcq, sstride3q
    mova            m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
    mova            m8, [filteryq+ 16]
    mova            m9, [filteryq+ 32]
    mova           m10, [filteryq+ 48]
    mova           m11, [filteryq+ 64]
    mova           m12, [filteryq+ 80]
    mova           m13, [filteryq+ 96]
    mova           m14, [filteryq+112]
%endif
.loop:
    ; FIXME maybe reuse loads from previous rows, or just
    ; more generally unroll this to prevent multiple loads of
    ; the same data?
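    ; Each output row is an 8-tap MAC over eight source rows: the bytes are
    ; zero-extended to words, multiplied by the per-tap word coefficients,
    ; summed, rounded by adding 64 and shifted right by 7 before repacking.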
    movh            m0, [srcq]
    movh            m1, [srcq+sstrideq]
    movh            m2, [srcq+sstrideq*2]
    movh            m3, [srcq+sstride3q]
    add           srcq, sstrideq
    movh            m4, [src4q]
    punpcklbw       m0, m5
    punpcklbw       m1, m5
    punpcklbw       m2, m5
    punpcklbw       m3, m5
    punpcklbw       m4, m5
    pmullw          m0, m7
%if ARCH_X86_64 && mmsize > 8
    pmullw          m1, m8
    pmullw          m2, m9
    pmullw          m3, m10
    pmullw          m4, m11
%else
    pmullw          m1, [filteryq+ 16]
    pmullw          m2, [filteryq+ 32]
    pmullw          m3, [filteryq+ 48]
    pmullw          m4, [filteryq+ 64]
%endif
    paddw           m0, m1
    paddw           m2, m3
    paddw           m0, m4
    movh            m1, [src4q+sstrideq]
    movh            m3, [src4q+sstrideq*2]
    movh            m4, [src4q+sstride3q]
    add          src4q, sstrideq
    punpcklbw       m1, m5
    punpcklbw       m3, m5
    punpcklbw       m4, m5
%if ARCH_X86_64 && mmsize > 8
    pmullw          m1, m12
    pmullw          m3, m13
    pmullw          m4, m14
%else
    pmullw          m1, [filteryq+ 80]
    pmullw          m3, [filteryq+ 96]
    pmullw          m4, [filteryq+112]
%endif
    paddw           m0, m1
    paddw           m3, m4
    paddw           m0, m6
    paddw           m2, m3
    paddsw          m0, m2
    psraw           m0, 7
%ifidn %1, avg
    movh            m1, [dstq]
%endif
    packuswb        m0, m0
%ifidn %1, avg
    pavgb           m0, m1
%endif
    movh        [dstq], m0
    add           dstq, dstrideq
    dec             hd
    jg .loop
    RET
%endmacro

INIT_MMX mmxext
filter_sse2_v_fn put
filter_sse2_v_fn avg

INIT_XMM sse2
filter_sse2_v_fn put
filter_sse2_v_fn avg

%macro filter_v_fn 1
%assign %%px mmsize/2
%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 6, 8, 11, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 4, 7, 11, dst, dstride, src, sstride, filtery, src4, sstride3
    mov       filteryq, r5mp
%define hd r4mp
%endif
    mova            m6, [pw_256]
    lea      sstride3q, [sstrideq*3]
    lea          src4q, [srcq+sstrideq]
    sub           srcq, sstride3q
    mova            m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
    mova            m8, [filteryq+32]
    mova            m9, [filteryq+64]
    mova           m10, [filteryq+96]
%endif
.loop:
    ; FIXME maybe reuse loads from previous rows, or just more generally
    ; unroll this to prevent multiple loads of the same data?
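    ; SSSE3 variant of the vertical MAC: adjacent source rows are interleaved
    ; so pmaddubsw can apply two int8 taps per multiply; pmulhrsw against
    ; pw_256 then performs the same round-and-shift-by-7 as the SSE2 path.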
    movh            m0, [srcq]
    movh            m1, [srcq+sstrideq]
    movh            m2, [srcq+sstrideq*2]
    movh            m3, [srcq+sstride3q]
    movh            m4, [src4q]
    movh            m5, [src4q+sstrideq]
    punpcklbw       m0, m1
    punpcklbw       m2, m3
    movh            m1, [src4q+sstrideq*2]
    movh            m3, [src4q+sstride3q]
    add           srcq, sstrideq
    add          src4q, sstrideq
    punpcklbw       m4, m5
    punpcklbw       m1, m3
    pmaddubsw       m0, m7
%if ARCH_X86_64 && mmsize > 8
    pmaddubsw       m2, m8
    pmaddubsw       m4, m9
    pmaddubsw       m1, m10
%else
    pmaddubsw       m2, [filteryq+32]
    pmaddubsw       m4, [filteryq+64]
    pmaddubsw       m1, [filteryq+96]
%endif
    paddw           m0, m4
    paddw           m2, m1
    paddsw          m0, m2
    pmulhrsw        m0, m6
%ifidn %1, avg
    movh            m1, [dstq]
%endif
    packuswb        m0, m0
%ifidn %1, avg
    pavgb           m0, m1
%endif
    movh        [dstq], m0
    add           dstq, dstrideq
    dec             hd
    jg .loop
    RET
%endmacro

INIT_MMX ssse3
filter_v_fn put
filter_v_fn avg

INIT_XMM ssse3
filter_v_fn put
filter_v_fn avg

%if ARCH_X86_64

%macro filter_vx2_fn 1
%assign %%px mmsize
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 6, 8, 14, dst, dstride, src, sstride, h, filtery, src4, sstride3
    mova           m13, [pw_256]
    lea      sstride3q, [sstrideq*3]
    lea          src4q, [srcq+sstrideq]
    sub           srcq, sstride3q
    mova            m8, [filteryq+ 0]
    mova            m9, [filteryq+32]
    mova           m10, [filteryq+64]
    mova           m11, [filteryq+96]
.loop:
    ; FIXME maybe reuse loads from previous rows, or just
    ; more generally unroll this to prevent multiple loads of
    ; the same data?
    movu            m0, [srcq]
    movu            m1, [srcq+sstrideq]
    movu            m2, [srcq+sstrideq*2]
    movu            m3, [srcq+sstride3q]
    movu            m4, [src4q]
    movu            m5, [src4q+sstrideq]
    movu            m6, [src4q+sstrideq*2]
    movu            m7, [src4q+sstride3q]
    add           srcq, sstrideq
    add          src4q, sstrideq
    SBUTTERFLY      bw, 0, 1, 12
    SBUTTERFLY      bw, 2, 3, 12
    SBUTTERFLY      bw, 4, 5, 12
    SBUTTERFLY      bw, 6, 7, 12
    pmaddubsw       m0, m8
    pmaddubsw       m1, m8
    pmaddubsw       m2, m9
    pmaddubsw       m3, m9
    pmaddubsw       m4, m10
    pmaddubsw       m5, m10
    pmaddubsw       m6, m11
    pmaddubsw       m7, m11
    paddw           m0, m4
    paddw           m1, m5
    paddw           m2, m6
    paddw           m3, m7
    paddsw          m0, m2
    paddsw          m1, m3
    pmulhrsw        m0, m13
    pmulhrsw        m1, m13
    packuswb        m0, m1
%ifidn %1, avg
    pavgb           m0, [dstq]
%endif
    mova        [dstq], m0
    add           dstq, dstrideq
    dec             hd
    jg .loop
    RET
%endmacro

INIT_XMM ssse3
filter_vx2_fn put
filter_vx2_fn avg

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
filter_vx2_fn put
filter_vx2_fn avg
%endif

%endif ; ARCH_X86_64

%macro fpel_fn 6-8 0, 4
%if %2 == 4
%define %%srcfn movh
%define %%dstfn movh
%else
%define %%srcfn movu
%define %%dstfn mova
%endif

%if %7 == 8
%define %%pavg pavgb
%define %%szsuf _8
%elif %7 == 16
%define %%pavg pavgw
%define %%szsuf _16
%else
%define %%szsuf
%endif

%if %2 <= mmsize
cglobal vp9_%1%2 %+ %%szsuf, 5, 7, 4, dst, dstride, src, sstride, h, dstride3, sstride3
    lea      sstride3q, [sstrideq*3]
    lea      dstride3q, [dstrideq*3]
%else
cglobal vp9_%1%2 %+ %%szsuf, 5, 5, %8, dst, dstride, src, sstride, h
%endif
.loop:
    %%srcfn         m0, [srcq]
    %%srcfn         m1, [srcq+s%3]
    %%srcfn         m2, [srcq+s%4]
    %%srcfn         m3, [srcq+s%5]
%if %2/mmsize == 8
    %%srcfn         m4, [srcq+mmsize*4]
    %%srcfn         m5, [srcq+mmsize*5]
    %%srcfn         m6, [srcq+mmsize*6]
    %%srcfn         m7, [srcq+mmsize*7]
%endif
    lea           srcq, [srcq+sstrideq*%6]
%ifidn %1, avg
    %%pavg          m0, [dstq]
    %%pavg          m1, [dstq+d%3]
    %%pavg          m2, [dstq+d%4]
    %%pavg          m3, [dstq+d%5]
%if %2/mmsize == 8
    %%pavg          m4, [dstq+mmsize*4]
    %%pavg          m5, [dstq+mmsize*5]
    %%pavg          m6, [dstq+mmsize*6]
    %%pavg          m7, [dstq+mmsize*7]
%endif
%endif
    %%dstfn     [dstq], m0
    %%dstfn [dstq+d%3], m1
    %%dstfn [dstq+d%4], m2
    %%dstfn [dstq+d%5], m3
%if %2/mmsize == 8
    %%dstfn [dstq+mmsize*4], m4
    %%dstfn [dstq+mmsize*5], m5
    %%dstfn [dstq+mmsize*6], m6
    %%dstfn [dstq+mmsize*7], m7
%endif
    lea           dstq, [dstq+dstrideq*%6]
    sub             hd, %6
    jnz .loop
    RET
%endmacro

%define d16 16
%define s16 16
%define d32 32
%define s32 32
INIT_MMX mmx
fpel_fn put, 4,   strideq, strideq*2, stride3q, 4
fpel_fn put, 8,   strideq, strideq*2, stride3q, 4
INIT_MMX mmxext
fpel_fn avg, 4,   strideq, strideq*2, stride3q, 4, 8
fpel_fn avg, 8,   strideq, strideq*2, stride3q, 4, 8
INIT_XMM sse
fpel_fn put, 16,  strideq, strideq*2, stride3q, 4
fpel_fn put, 32,  mmsize,  strideq,   strideq+mmsize, 2
fpel_fn put, 64,  mmsize,  mmsize*2,  mmsize*3, 1
fpel_fn put, 128, mmsize,  mmsize*2,  mmsize*3, 1, 0, 8
INIT_XMM sse2
fpel_fn avg, 16,  strideq, strideq*2, stride3q, 4, 8
fpel_fn avg, 32,  mmsize,  strideq,   strideq+mmsize, 2, 8
fpel_fn avg, 64,  mmsize,  mmsize*2,  mmsize*3, 1, 8
INIT_YMM avx
fpel_fn put, 32,  strideq, strideq*2, stride3q, 4
fpel_fn put, 64,  mmsize,  strideq,   strideq+mmsize, 2
fpel_fn put, 128, mmsize,  mmsize*2,  mmsize*3, 1
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
fpel_fn avg, 32,  strideq, strideq*2, stride3q, 4, 8
fpel_fn avg, 64,  mmsize,  strideq,   strideq+mmsize, 2, 8
%endif
INIT_MMX mmxext
fpel_fn avg, 8,   strideq, strideq*2, stride3q, 4, 16
INIT_XMM sse2
fpel_fn avg, 16,  strideq, strideq*2, stride3q, 4, 16
fpel_fn avg, 32,  mmsize,  strideq,   strideq+mmsize, 2, 16
fpel_fn avg, 64,  mmsize,  mmsize*2,  mmsize*3, 1, 16
fpel_fn avg, 128, mmsize,  mmsize*2,  mmsize*3, 1, 16, 8
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
fpel_fn avg, 32,  strideq, strideq*2, stride3q, 4, 16
fpel_fn avg, 64,  mmsize,  strideq,   strideq+mmsize, 2, 16
fpel_fn avg, 128, mmsize,  mmsize*2,  mmsize*3, 1, 16
%endif
%undef s16
%undef d16
%undef s32
%undef d32