;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA
; Rounding constant for the >>4 at the end of each bilinear filter tap sum.
pw_8: times 8 dw 8
; 16-entry bilinear filter table for sse2, indexed by (offset << 5):
; per offset x, 8 words of (16 - x) followed by 8 words of x.
bilin_filter_m_sse2: times 8 dw 16
                     times 8 dw  0
                     times 8 dw 14
                     times 8 dw  2
                     times 8 dw 12
                     times 8 dw  4
                     times 8 dw 10
                     times 8 dw  6
                     times 16 dw 8
                     times 8 dw  6
                     times 8 dw 10
                     times 8 dw  4
                     times 8 dw 12
                     times 8 dw  2
                     times 8 dw 14

; ssse3 variant: interleaved byte pairs (16-x, x) suitable for pmaddubsw,
; indexed by (offset << 4).
bilin_filter_m_ssse3: times 8 db 16,  0
                      times 8 db 14,  2
                      times 8 db 12,  4
                      times 8 db 10,  6
                      times 16 db  8
                      times 8 db  6, 10
                      times 8 db  4, 12
                      times 8 db  2, 14

SECTION .text

; int vpx_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
;                               int x_offset, int y_offset,
;                               const uint8_t *dst, ptrdiff_t dst_stride,
;                               int height, unsigned int *sse);
;
; This function returns the SE and stores SSE in the given pointer.

; Accumulate the running sum (into %5) and squared error (into %6) for two
; groups of pixels that have already been unpacked to 16-bit words:
; %1/%3 hold filtered source words, %2/%4 the corresponding dst words.
%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
  psubw        %3, %4
  psubw        %1, %2
  paddw        %5, %3
  pmaddwd      %3, %3
  paddw        %5, %1
  pmaddwd      %1, %1
  paddd        %6, %3
  paddd        %6, %1
%endmacro

; Horizontally reduce the sum accumulator (m6, signed words) and the SSE
; accumulator (m7, dwords), store SSE through the sse pointer and leave the
; sum in rax/eax as the return value.
%macro STORE_AND_RET 0
%if mmsize == 16
  ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
  ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
  ; We have to sign-extend it before adding the words within the register
  ; and outputing to a dword.
  pcmpgtw      m5, m6           ; mask for 0 > x
  movhlps      m3, m7
  punpcklwd    m4, m6, m5
  punpckhwd    m6, m5           ; sign-extend m6 word->dword
  paddd        m7, m3
  paddd        m6, m4
  pshufd       m3, m7, 0x1
  movhlps      m4, m6
  paddd        m7, m3
  paddd        m6, m4
  mov          r1, ssem         ; r1 = unsigned int *sse
  pshufd       m4, m6, 0x1
  movd         [r1], m7         ; store sse
  paddd        m6, m4
  movd         raxd, m6         ; store sum as return value
%else ; mmsize == 8
  pshufw       m4, m6, 0xe
  pshufw       m3, m7, 0xe
  paddw        m6, m4
  paddd        m7, m3
  pcmpgtw      m5, m6           ; mask for 0 > x
  mov          r1, ssem         ; r1 = unsigned int *sse
  punpcklwd    m6, m5           ; sign-extend m6 word->dword
  movd         [r1], m7         ; store sse
  pshufw       m4, m6, 0xe
  paddd        m6, m4
  movd         raxd, m6         ; store sum as return value
%endif
  RET
%endmacro

; Advance srcq by one row.  On x86-32 PIC builds the src_stride register was
; repurposed (see the x_other_y_other setup), so the stride must be reloaded
; from its stack slot instead.
%macro INC_SRC_BY_SRC_STRIDE  0
%if ARCH_X86=1 && CONFIG_PIC=1
  add srcq, src_stridemp
%else
  add srcq, src_strideq
%endif
%endmacro

; SUBPEL_VARIANCE W[, AVG]
; Emits sub_pixel_variance<W>xh (AVG absent/0) or sub_pixel_avg_variance<W>xh
; (AVG=1) for the currently selected instruction set (INIT_MMX/INIT_XMM).
; %1 = block width (4, 8 or 16); the avg variant first averages the filtered
; prediction with a second predictor (sec) before computing the variance.
; Dispatches on x_offset/y_offset being 0, 8 (half-pel, plain pavgb) or an
; arbitrary bilinear filter index — one dedicated loop per combination.
; For W < 16 each loop iteration processes two rows (block_height is halved).
%macro SUBPEL_VARIANCE 1-2 0 ; W
%if cpuflag(ssse3)
%define bilin_filter_m bilin_filter_m_ssse3
%define filter_idx_shift 4
%else
%define bilin_filter_m bilin_filter_m_sse2
%define filter_idx_shift 5
%endif
; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
; 11, not 13, if the registers are ordered correctly. May make a minor speed
; difference on Win64

%ifdef PIC    ; 64bit PIC
  %if %2 == 1 ; avg
    cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
                                        x_offset, y_offset, \
                                        dst, dst_stride, \
                                        sec, sec_stride, height, sse
    %define sec_str sec_strideq
  %else
    cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
                                    y_offset, dst, dst_stride, height, sse
  %endif
  %define block_height heightd
  %define bilin_filter sseq
%else
  %if ARCH_X86=1 && CONFIG_PIC=1
    %if %2 == 1 ; avg
      cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
                                          x_offset, y_offset, \
                                          dst, dst_stride, \
                                          sec, sec_stride, \
                                          height, sse, g_bilin_filter, g_pw_8
      %define block_height dword heightm
      %define sec_str sec_stridemp

      ;Store bilin_filter and pw_8 location in stack
      GET_GOT eax
      add esp, 4                ; restore esp

      lea ecx, [GLOBAL(bilin_filter_m)]
      mov g_bilin_filterm, ecx

      lea ecx, [GLOBAL(pw_8)]
      mov g_pw_8m, ecx

      LOAD_IF_USED 0, 1         ; load eax, ecx back
    %else
      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
                                      y_offset, dst, dst_stride, height, sse, \
                                      g_bilin_filter, g_pw_8
      %define block_height heightd

      ;Store bilin_filter and pw_8 location in stack
      GET_GOT eax
      add esp, 4                ; restore esp

      lea ecx, [GLOBAL(bilin_filter_m)]
      mov g_bilin_filterm, ecx

      lea ecx, [GLOBAL(pw_8)]
      mov g_pw_8m, ecx

      LOAD_IF_USED 0, 1         ; load eax, ecx back
    %endif
  %else
    %if %2 == 1 ; avg
      cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
                                          7 + 2 * ARCH_X86_64, 13, src, src_stride, \
                                          x_offset, y_offset, \
                                          dst, dst_stride, \
                                          sec, sec_stride, \
                                          height, sse
      %if ARCH_X86_64
        %define block_height heightd
        %define sec_str sec_strideq
      %else
        %define block_height dword heightm
        %define sec_str sec_stridemp
      %endif
    %else
      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
                                      y_offset, dst, dst_stride, height, sse
      %define block_height heightd
    %endif

    %define bilin_filter bilin_filter_m
  %endif
%endif

  ASSERT               %1 <= 16         ; m6 overflows if w > 16
  pxor                 m6, m6           ; sum
  pxor                 m7, m7           ; sse
  ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we
  ; could perhaps use it for something more productive then
  pxor                 m5, m5           ; dedicated zero register
%if %1 < 16
  sar                  block_height, 1  ; two rows handled per iteration
%if %2 == 1 ; avg
  shl                  sec_str, 1
%endif
%endif

  ; FIXME(rbultje) replace by jumptable?
  test                 x_offsetd, x_offsetd
  jnz .x_nonzero
  ; x_offset == 0
  test                 y_offsetd, y_offsetd
  jnz .x_zero_y_nonzero

  ; x_offset == 0 && y_offset == 0
.x_zero_y_zero_loop:
%if %1 == 16
  movu                 m0, [srcq]
  mova                 m1, [dstq]
%if %2 == 1 ; avg
  pavgb                m0, [secq]
  punpckhbw            m3, m1, m5
  punpcklbw            m1, m5
%endif
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%if %2 == 0 ; !avg
  punpckhbw            m3, m1, m5
  punpcklbw            m1, m5
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7

  add                  srcq, src_strideq
  add                  dstq, dst_strideq
%else ; %1 < 16
  movh                 m0, [srcq]
%if %2 == 1 ; avg
%if mmsize == 16
  movhps               m0, [srcq+src_strideq]
%else ; mmsize == 8
  punpckldq            m0, [srcq+src_strideq]
%endif
%else ; !avg
  movh                 m2, [srcq+src_strideq]
%endif
  movh                 m1, [dstq]
  movh                 m3, [dstq+dst_strideq]
%if %2 == 1 ; avg
  pavgb                m0, [secq]
  punpcklbw            m3, m5
  punpcklbw            m1, m5
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else ; !avg
  punpcklbw            m0, m5
  punpcklbw            m2, m5
  punpcklbw            m3, m5
  punpcklbw            m1, m5
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7

  lea                  srcq, [srcq+src_strideq*2]
  lea                  dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                  secq, sec_str
%endif
  dec                  block_height
  jg .x_zero_y_zero_loop
  STORE_AND_RET

.x_zero_y_nonzero:
  cmp                  y_offsetd, 8
  jne .x_zero_y_nonhalf

  ; x_offset == 0 && y_offset == 0.5
.x_zero_y_half_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m4, [srcq+src_strideq]
  mova                 m1, [dstq]
  pavgb                m0, m4
  punpckhbw            m3, m1, m5
%if %2 == 1 ; avg
  pavgb                m0, [secq]
%endif
  punpcklbw            m1, m5
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7

  add                  srcq, src_strideq
  add                  dstq, dst_strideq
%else ; %1 < 16
  movh                 m0, [srcq]
  movh                 m2, [srcq+src_strideq]
%if %2 == 1 ; avg
%if mmsize == 16
  movhps               m2, [srcq+src_strideq*2]
%else ; mmsize == 8
%if %1 == 4
  movh                 m1, [srcq+src_strideq*2]
  punpckldq            m2, m1
%else
  punpckldq            m2, [srcq+src_strideq*2]
%endif
%endif
  movh                 m1, [dstq]
%if mmsize == 16
  movlhps              m0, m2
%else ; mmsize == 8
  punpckldq            m0, m2
%endif
  movh                 m3, [dstq+dst_strideq]
  pavgb                m0, m2
  punpcklbw            m1, m5
  pavgb                m0, [secq]
  punpcklbw            m3, m5
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else ; !avg
  movh                 m4, [srcq+src_strideq*2]
  movh                 m1, [dstq]
  pavgb                m0, m2
  movh                 m3, [dstq+dst_strideq]
  pavgb                m2, m4
  punpcklbw            m0, m5
  punpcklbw            m2, m5
  punpcklbw            m3, m5
  punpcklbw            m1, m5
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7

  lea                  srcq, [srcq+src_strideq*2]
  lea                  dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                  secq, sec_str
%endif
  dec                  block_height
  jg .x_zero_y_half_loop
  STORE_AND_RET

.x_zero_y_nonhalf:
  ; x_offset == 0 && y_offset == bilin interpolation
%ifdef PIC
  lea        bilin_filter, [bilin_filter_m]
%endif
  shl                  y_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova                 m8, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova                 m9, [bilin_filter+y_offsetq+16]
%endif
  mova                 m10, [pw_8]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else ; x86-32 or mmx
%if ARCH_X86=1 && CONFIG_PIC=1
; x_offset == 0, reuse x_offset reg
%define tempq x_offsetq
  add y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add                  y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

.x_zero_y_other_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m4, [srcq+src_strideq]
  mova                 m1, [dstq]
%if cpuflag(ssse3)
  punpckhbw            m2, m0, m4
  punpcklbw            m0, m4
  pmaddubsw            m2, filter_y_a
  pmaddubsw            m0, filter_y_a
  paddw                m2, filter_rnd
  paddw                m0, filter_rnd
%else
  punpckhbw            m2, m0, m5
  punpckhbw            m3, m4, m5
  punpcklbw            m0, m5
  punpcklbw            m4, m5
  ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
  ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
  ; instructions is the same (5), but it is 1 mul instead of 2, so might be
  ; slightly faster because of pmullw latency. It would also cut our rodata
  ; tables in half for this function, and save 1-2 registers on x86-64.
  pmullw               m2, filter_y_a
  pmullw               m3, filter_y_b
  paddw                m2, filter_rnd
  pmullw               m0, filter_y_a
  pmullw               m4, filter_y_b
  paddw                m0, filter_rnd
  paddw                m2, m3
  paddw                m0, m4
%endif
  psraw                m2, 4
  psraw                m0, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb             m0, m2
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%endif
  punpckhbw            m3, m1, m5
  punpcklbw            m1, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7

  add                  srcq, src_strideq
  add                  dstq, dst_strideq
%else ; %1 < 16
  movh                 m0, [srcq]
  movh                 m2, [srcq+src_strideq]
  movh                 m4, [srcq+src_strideq*2]
  movh                 m3, [dstq+dst_strideq]
%if cpuflag(ssse3)
  movh                 m1, [dstq]
  punpcklbw            m0, m2
  punpcklbw            m2, m4
  pmaddubsw            m0, filter_y_a
  pmaddubsw            m2, filter_y_a
  punpcklbw            m3, m5
  paddw                m2, filter_rnd
  paddw                m0, filter_rnd
%else
  punpcklbw            m0, m5
  punpcklbw            m2, m5
  punpcklbw            m4, m5
  pmullw               m0, filter_y_a
  pmullw               m1, m2, filter_y_b
  punpcklbw            m3, m5
  paddw                m0, filter_rnd
  pmullw               m2, filter_y_a
  pmullw               m4, filter_y_b
  paddw                m0, m1
  paddw                m2, filter_rnd
  movh                 m1, [dstq]
  paddw                m2, m4
%endif
  psraw                m0, 4
  psraw                m2, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb             m0, m2
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%endif
  punpcklbw            m1, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7

  lea                  srcq, [srcq+src_strideq*2]
  lea                  dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                  secq, sec_str
%endif
  dec                  block_height
  jg .x_zero_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET

.x_nonzero:
  cmp                  x_offsetd, 8
  jne .x_nonhalf
  ; x_offset == 0.5
  test                 y_offsetd, y_offsetd
  jnz .x_half_y_nonzero

  ; x_offset == 0.5 && y_offset == 0
.x_half_y_zero_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m4, [srcq+1]
  mova                 m1, [dstq]
  pavgb                m0, m4
  punpckhbw            m3, m1, m5
%if %2 == 1 ; avg
  pavgb                m0, [secq]
%endif
  punpcklbw            m1, m5
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7

  add                  srcq, src_strideq
  add                  dstq, dst_strideq
%else ; %1 < 16
  movh                 m0, [srcq]
  movh                 m4, [srcq+1]
%if %2 == 1 ; avg
%if mmsize == 16
  movhps               m0, [srcq+src_strideq]
  movhps               m4, [srcq+src_strideq+1]
%else ; mmsize == 8
  punpckldq            m0, [srcq+src_strideq]
  punpckldq            m4, [srcq+src_strideq+1]
%endif
  movh                 m1, [dstq]
  movh                 m3, [dstq+dst_strideq]
  pavgb                m0, m4
  punpcklbw            m3, m5
  pavgb                m0, [secq]
  punpcklbw            m1, m5
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else ; !avg
  movh                 m2, [srcq+src_strideq]
  movh                 m1, [dstq]
  pavgb                m0, m4
  movh                 m4, [srcq+src_strideq+1]
  movh                 m3, [dstq+dst_strideq]
  pavgb                m2, m4
  punpcklbw            m0, m5
  punpcklbw            m2, m5
  punpcklbw            m3, m5
  punpcklbw            m1, m5
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7

  lea                  srcq, [srcq+src_strideq*2]
  lea                  dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                  secq, sec_str
%endif
  dec                  block_height
  jg .x_half_y_zero_loop
  STORE_AND_RET

.x_half_y_nonzero:
  cmp                  y_offsetd, 8
  jne .x_half_y_nonhalf

  ; x_offset == 0.5 && y_offset == 0.5
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m3, [srcq+1]
  add                  srcq, src_strideq
  pavgb                m0, m3
.x_half_y_half_loop:
  movu                 m4, [srcq]
  movu                 m3, [srcq+1]
  mova                 m1, [dstq]
  pavgb                m4, m3
  punpckhbw            m3, m1, m5
  pavgb                m0, m4
%if %2 == 1 ; avg
  punpcklbw            m1, m5
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
  punpcklbw            m1, m5
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4             ; carry the filtered row to next pass

  add                  srcq, src_strideq
  add                  dstq, dst_strideq
%else ; %1 < 16
  movh                 m0, [srcq]
  movh                 m3, [srcq+1]
  add                  srcq, src_strideq
  pavgb                m0, m3
.x_half_y_half_loop:
  movh                 m2, [srcq]
  movh                 m3, [srcq+1]
%if %2 == 1 ; avg
%if mmsize == 16
  movhps               m2, [srcq+src_strideq]
  movhps               m3, [srcq+src_strideq+1]
%else
%if %1 == 4
  movh                 m1, [srcq+src_strideq]
  punpckldq            m2, m1
  movh                 m1, [srcq+src_strideq+1]
  punpckldq            m3, m1
%else
  punpckldq            m2, [srcq+src_strideq]
  punpckldq            m3, [srcq+src_strideq+1]
%endif
%endif
  pavgb                m2, m3
%if mmsize == 16
  movlhps              m0, m2
  movhlps              m4, m2
%else ; mmsize == 8
  punpckldq            m0, m2
  pshufw               m4, m2, 0xe
%endif
  movh                 m1, [dstq]
  pavgb                m0, m2
  movh                 m3, [dstq+dst_strideq]
  pavgb                m0, [secq]
  punpcklbw            m3, m5
  punpcklbw            m1, m5
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else ; !avg
  movh                 m4, [srcq+src_strideq]
  movh                 m1, [srcq+src_strideq+1]
  pavgb                m2, m3
  pavgb                m4, m1
  pavgb                m0, m2
  pavgb                m2, m4
  movh                 m1, [dstq]
  movh                 m3, [dstq+dst_strideq]
  punpcklbw            m0, m5
  punpcklbw            m2, m5
  punpcklbw            m3, m5
  punpcklbw            m1, m5
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4             ; carry the filtered row to next pass

  lea                  srcq, [srcq+src_strideq*2]
  lea                  dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                  secq, sec_str
%endif
  dec                  block_height
  jg .x_half_y_half_loop
  STORE_AND_RET

.x_half_y_nonhalf:
  ; x_offset == 0.5 && y_offset == bilin interpolation
%ifdef PIC
  lea        bilin_filter, [bilin_filter_m]
%endif
  shl                  y_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova                 m8, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova                 m9, [bilin_filter+y_offsetq+16]
%endif
  mova                 m10, [pw_8]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else ;x86_32
%if ARCH_X86=1 && CONFIG_PIC=1
; x_offset == 0.5. We can reuse x_offset reg
%define tempq x_offsetq
  add y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add                  y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

%if %1 == 16
  movu                 m0, [srcq]
  movu                 m3, [srcq+1]
  add                  srcq, src_strideq
  pavgb                m0, m3
.x_half_y_other_loop:
  movu                 m4, [srcq]
  movu                 m2, [srcq+1]
  mova                 m1, [dstq]
  pavgb                m4, m2
%if cpuflag(ssse3)
  punpckhbw            m2, m0, m4
  punpcklbw            m0, m4
  pmaddubsw            m2, filter_y_a
  pmaddubsw            m0, filter_y_a
  paddw                m2, filter_rnd
  paddw                m0, filter_rnd
  psraw                m2, 4
%else
  punpckhbw            m2, m0, m5
  punpckhbw            m3, m4, m5
  pmullw               m2, filter_y_a
  pmullw               m3, filter_y_b
  paddw                m2, filter_rnd
  punpcklbw            m0, m5
  paddw                m2, m3
  punpcklbw            m3, m4, m5
  pmullw               m0, filter_y_a
  pmullw               m3, filter_y_b
  paddw                m0, filter_rnd
  psraw                m2, 4
  paddw                m0, m3
%endif
  punpckhbw            m3, m1, m5
  psraw                m0, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb             m0, m2
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%endif
  punpcklbw            m1, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4             ; carry the filtered row to next pass

  add                  srcq, src_strideq
  add                  dstq, dst_strideq
%else ; %1 < 16
  movh                 m0, [srcq]
  movh                 m3, [srcq+1]
  add                  srcq, src_strideq
  pavgb                m0, m3
%if notcpuflag(ssse3)
  punpcklbw            m0, m5
%endif
.x_half_y_other_loop:
  movh                 m2, [srcq]
  movh                 m1, [srcq+1]
  movh                 m4, [srcq+src_strideq]
  movh                 m3, [srcq+src_strideq+1]
  pavgb                m2, m1
  pavgb                m4, m3
  movh                 m3, [dstq+dst_strideq]
%if cpuflag(ssse3)
  movh                 m1, [dstq]
  punpcklbw            m0, m2
  punpcklbw            m2, m4
  pmaddubsw            m0, filter_y_a
  pmaddubsw            m2, filter_y_a
  punpcklbw            m3, m5
  paddw                m0, filter_rnd
  paddw                m2, filter_rnd
%else
  punpcklbw            m2, m5
  punpcklbw            m4, m5
  pmullw               m0, filter_y_a
  pmullw               m1, m2, filter_y_b
  punpcklbw            m3, m5
  paddw                m0, filter_rnd
  pmullw               m2, filter_y_a
  paddw                m0, m1
  pmullw               m1, m4, filter_y_b
  paddw                m2, filter_rnd
  paddw                m2, m1
  movh                 m1, [dstq]
%endif
  psraw                m0, 4
  psraw                m2, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb             m0, m2
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%endif
  punpcklbw            m1, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4             ; carry the filtered row to next pass

  lea                  srcq, [srcq+src_strideq*2]
  lea                  dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                  secq, sec_str
%endif
  dec                  block_height
  jg .x_half_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET

.x_nonhalf:
  test                 y_offsetd, y_offsetd
  jnz .x_nonhalf_y_nonzero

  ; x_offset == bilin interpolation && y_offset == 0
%ifdef PIC
  lea        bilin_filter, [bilin_filter_m]
%endif
  shl                  x_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova                 m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova                 m9, [bilin_filter+x_offsetq+16]
%endif
  mova                 m10, [pw_8]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
;y_offset == 0. We can reuse y_offset reg.
%define tempq y_offsetq
  add x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add                  x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

.x_other_y_zero_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m4, [srcq+1]
  mova                 m1, [dstq]
%if cpuflag(ssse3)
  punpckhbw            m2, m0, m4
  punpcklbw            m0, m4
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m0, filter_x_a
  paddw                m2, filter_rnd
  paddw                m0, filter_rnd
%else
  punpckhbw            m2, m0, m5
  punpckhbw            m3, m4, m5
  punpcklbw            m0, m5
  punpcklbw            m4, m5
  pmullw               m2, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m2, filter_rnd
  pmullw               m0, filter_x_a
  pmullw               m4, filter_x_b
  paddw                m0, filter_rnd
  paddw                m2, m3
  paddw                m0, m4
%endif
  psraw                m2, 4
  psraw                m0, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb             m0, m2
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%endif
  punpckhbw            m3, m1, m5
  punpcklbw            m1, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7

  add                  srcq, src_strideq
  add                  dstq, dst_strideq
%else ; %1 < 16
  movh                 m0, [srcq]
  movh                 m1, [srcq+1]
  movh                 m2, [srcq+src_strideq]
  movh                 m4, [srcq+src_strideq+1]
  movh                 m3, [dstq+dst_strideq]
%if cpuflag(ssse3)
  punpcklbw            m0, m1
  movh                 m1, [dstq]
  punpcklbw            m2, m4
  pmaddubsw            m0, filter_x_a
  pmaddubsw            m2, filter_x_a
  punpcklbw            m3, m5
  paddw                m0, filter_rnd
  paddw                m2, filter_rnd
%else
  punpcklbw            m0, m5
  punpcklbw            m1, m5
  punpcklbw            m2, m5
  punpcklbw            m4, m5
  pmullw               m0, filter_x_a
  pmullw               m1, filter_x_b
  punpcklbw            m3, m5
  paddw                m0, filter_rnd
  pmullw               m2, filter_x_a
  pmullw               m4, filter_x_b
  paddw                m0, m1
  paddw                m2, filter_rnd
  movh                 m1, [dstq]
  paddw                m2, m4
%endif
  psraw                m0, 4
  psraw                m2, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb             m0, m2
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%endif
  punpcklbw            m1, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7

  lea                  srcq, [srcq+src_strideq*2]
  lea                  dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                  secq, sec_str
%endif
  dec                  block_height
  jg .x_other_y_zero_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
  STORE_AND_RET

.x_nonhalf_y_nonzero:
  cmp                  y_offsetd, 8
  jne .x_nonhalf_y_nonhalf

  ; x_offset == bilin interpolation && y_offset == 0.5
%ifdef PIC
  lea        bilin_filter, [bilin_filter_m]
%endif
  shl                  x_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova                 m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova                 m9, [bilin_filter+x_offsetq+16]
%endif
  mova                 m10, [pw_8]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
; y_offset == 0.5. We can reuse y_offset reg.
%define tempq y_offsetq
  add x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add                  x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

%if %1 == 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+1]
%if cpuflag(ssse3)
  punpckhbw            m2, m0, m1
  punpcklbw            m0, m1
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m0, filter_x_a
  paddw                m2, filter_rnd
  paddw                m0, filter_rnd
%else
  punpckhbw            m2, m0, m5
  punpckhbw            m3, m1, m5
  punpcklbw            m0, m5
  punpcklbw            m1, m5
  pmullw               m0, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m0, filter_rnd
  pmullw               m2, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m2, filter_rnd
  paddw                m0, m1
  paddw                m2, m3
%endif
  psraw                m0, 4
  psraw                m2, 4
  add                  srcq, src_strideq
  packuswb             m0, m2
.x_other_y_half_loop:
  movu                 m4, [srcq]
  movu                 m3, [srcq+1]
%if cpuflag(ssse3)
  mova                 m1, [dstq]
  punpckhbw            m2, m4, m3
  punpcklbw            m4, m3
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m4, filter_x_a
  paddw                m2, filter_rnd
  paddw                m4, filter_rnd
  psraw                m2, 4
  psraw                m4, 4
  packuswb             m4, m2
  pavgb                m0, m4
  punpckhbw            m3, m1, m5
  punpcklbw            m1, m5
%else
  punpckhbw            m2, m4, m5
  punpckhbw            m1, m3, m5
  punpcklbw            m4, m5
  punpcklbw            m3, m5
  pmullw               m4, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m4, filter_rnd
  pmullw               m2, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m2, filter_rnd
  paddw                m4, m3
  paddw                m2, m1
  mova                 m1, [dstq]
  psraw                m4, 4
  psraw                m2, 4
  punpckhbw            m3, m1, m5
  ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we
  ; have a 1-register shortage to be able to store the backup of the bilin
  ; filtered second line as words as cache for the next line. Packing into
  ; a byte costs 1 pack and 2 unpacks, but saves a register.
  packuswb             m4, m2
  punpcklbw            m1, m5
  pavgb                m0, m4
%endif
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  pavgb                m0, [secq]
%endif
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4             ; carry the filtered row to next pass

  add                  srcq, src_strideq
  add                  dstq, dst_strideq
%else ; %1 < 16
  movh                 m0, [srcq]
  movh                 m1, [srcq+1]
%if cpuflag(ssse3)
  punpcklbw            m0, m1
  pmaddubsw            m0, filter_x_a
  paddw                m0, filter_rnd
%else
  punpcklbw            m0, m5
  punpcklbw            m1, m5
  pmullw               m0, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m0, filter_rnd
  paddw                m0, m1
%endif
  add                  srcq, src_strideq
  psraw                m0, 4
.x_other_y_half_loop:
  movh                 m2, [srcq]
  movh                 m1, [srcq+1]
  movh                 m4, [srcq+src_strideq]
  movh                 m3, [srcq+src_strideq+1]
%if cpuflag(ssse3)
  punpcklbw            m2, m1
  punpcklbw            m4, m3
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m4, filter_x_a
  movh                 m1, [dstq]
  movh                 m3, [dstq+dst_strideq]
  paddw                m2, filter_rnd
  paddw                m4, filter_rnd
%else
  punpcklbw            m2, m5
  punpcklbw            m1, m5
  punpcklbw            m4, m5
  punpcklbw            m3, m5
  pmullw               m2, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m2, filter_rnd
  pmullw               m4, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m4, filter_rnd
  paddw                m2, m1
  movh                 m1, [dstq]
  paddw                m4, m3
  movh                 m3, [dstq+dst_strideq]
%endif
  psraw                m2, 4
  psraw                m4, 4
  pavgw                m0, m2
  pavgw                m2, m4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline - also consider going to bytes here
  packuswb             m0, m2
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%endif
  punpcklbw            m3, m5
  punpcklbw            m1, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4             ; carry the filtered row to next pass

  lea                  srcq, [srcq+src_strideq*2]
  lea                  dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                  secq, sec_str
%endif
  dec                  block_height
  jg .x_other_y_half_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
  STORE_AND_RET

.x_nonhalf_y_nonhalf:
%ifdef PIC
  lea        bilin_filter, [bilin_filter_m]
%endif
  shl                  x_offsetd, filter_idx_shift
  shl                  y_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova                 m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova                 m9, [bilin_filter+x_offsetq+16]
%endif
  mova                 m10, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova                 m11, [bilin_filter+y_offsetq+16]
%endif
  mova                 m12, [pw_8]
%define filter_x_a m8
%define filter_x_b m9
%define filter_y_a m10
%define filter_y_b m11
%define filter_rnd m12
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
; In this case, there is NO unused register. Used src_stride register. Later,
; src_stride has to be loaded from stack when it is needed.
%define tempq src_strideq
  mov tempq, g_bilin_filterm
  add x_offsetq, tempq
  add y_offsetq, tempq
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]

  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add                  x_offsetq, bilin_filter
  add                  y_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

  ; x_offset == bilin interpolation && y_offset == bilin interpolation
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+1]
%if cpuflag(ssse3)
  punpckhbw            m2, m0, m1
  punpcklbw            m0, m1
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m0, filter_x_a
  paddw                m2, filter_rnd
  paddw                m0, filter_rnd
%else
  punpckhbw            m2, m0, m5
  punpckhbw            m3, m1, m5
  punpcklbw            m0, m5
  punpcklbw            m1, m5
  pmullw               m0, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m0, filter_rnd
  pmullw               m2, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m2, filter_rnd
  paddw                m0, m1
  paddw                m2, m3
%endif
  psraw                m0, 4
  psraw                m2, 4

  INC_SRC_BY_SRC_STRIDE

  packuswb             m0, m2
.x_other_y_other_loop:
%if cpuflag(ssse3)
  movu                 m4, [srcq]
  movu                 m3, [srcq+1]
  mova                 m1, [dstq]
  punpckhbw            m2, m4, m3
  punpcklbw            m4, m3
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m4, filter_x_a
  punpckhbw            m3, m1, m5
  paddw                m2, filter_rnd
  paddw                m4, filter_rnd
  psraw                m2, 4
  psraw                m4, 4
  packuswb             m4, m2
  punpckhbw            m2, m0, m4
  punpcklbw            m0, m4
  pmaddubsw            m2, filter_y_a
  pmaddubsw            m0, filter_y_a
  punpcklbw            m1, m5
  paddw                m2, filter_rnd
  paddw                m0, filter_rnd
  psraw                m2, 4
  psraw                m0, 4
%else
  movu                 m3, [srcq]
  movu                 m4, [srcq+1]
  punpckhbw            m1, m3, m5
  punpckhbw            m2, m4, m5
  punpcklbw            m3, m5
  punpcklbw            m4, m5
  pmullw               m3, filter_x_a
  pmullw               m4, filter_x_b
  paddw                m3, filter_rnd
  pmullw               m1, filter_x_a
  pmullw               m2, filter_x_b
  paddw                m1, filter_rnd
  paddw                m3, m4
  paddw                m1, m2
  psraw                m3, 4
  psraw                m1, 4
  packuswb             m4, m3, m1
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
  pmullw               m2, filter_y_a
  pmullw               m1, filter_y_b
  paddw                m2, filter_rnd
  pmullw               m0, filter_y_a
  pmullw               m3, filter_y_b
  paddw                m2, m1
  mova                 m1, [dstq]
  paddw                m0, filter_rnd
  psraw                m2, 4
  paddw                m0, m3
  punpckhbw            m3, m1, m5
  psraw                m0, 4
  punpcklbw            m1, m5
%endif
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb             m0, m2
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4             ; carry the x-filtered row forward

  INC_SRC_BY_SRC_STRIDE
  add                  dstq, dst_strideq
%else ; %1 < 16
  movh                 m0, [srcq]
  movh                 m1, [srcq+1]
%if cpuflag(ssse3)
  punpcklbw            m0, m1
  pmaddubsw            m0, filter_x_a
  paddw                m0, filter_rnd
%else
  punpcklbw            m0, m5
  punpcklbw            m1, m5
  pmullw               m0, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m0, filter_rnd
  paddw                m0, m1
%endif
  psraw                m0, 4
%if cpuflag(ssse3)
  packuswb             m0, m0
%endif

  INC_SRC_BY_SRC_STRIDE

.x_other_y_other_loop:
  movh                 m2, [srcq]
  movh                 m1, [srcq+1]

  INC_SRC_BY_SRC_STRIDE
  movh                 m4, [srcq]
  movh                 m3, [srcq+1]

%if cpuflag(ssse3)
  punpcklbw            m2, m1
  punpcklbw            m4, m3
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m4, filter_x_a
  movh                 m3, [dstq+dst_strideq]
  movh                 m1, [dstq]
  paddw                m2, filter_rnd
  paddw                m4, filter_rnd
  psraw                m2, 4
  psraw                m4, 4
  packuswb             m2, m2
  packuswb             m4, m4
  punpcklbw            m0, m2
  punpcklbw            m2, m4
  pmaddubsw            m0, filter_y_a
  pmaddubsw            m2, filter_y_a
  punpcklbw            m3, m5
  paddw                m0, filter_rnd
  paddw                m2, filter_rnd
  psraw                m0, 4
  psraw                m2, 4
  punpcklbw            m1, m5
%else
  punpcklbw            m2, m5
  punpcklbw            m1, m5
  punpcklbw            m4, m5
  punpcklbw            m3, m5
  pmullw               m2, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m2, filter_rnd
  pmullw               m4, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m4, filter_rnd
  paddw                m2, m1
  paddw                m4, m3
  psraw                m2, 4
  psraw                m4, 4
  pmullw               m0, filter_y_a
  pmullw               m3, m2, filter_y_b
  paddw                m0, filter_rnd
  pmullw               m2, filter_y_a
  pmullw               m1, m4, filter_y_b
  paddw                m2, filter_rnd
  paddw                m0, m3
  movh                 m3, [dstq+dst_strideq]
  paddw                m2, m1
  movh                 m1, [dstq]
  psraw                m0, 4
  psraw                m2, 4
  punpcklbw            m3, m5
  punpcklbw            m1, m5
%endif
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb             m0, m2
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4             ; carry the x-filtered row forward

  INC_SRC_BY_SRC_STRIDE
  lea                  dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                  secq, sec_str
%endif
  dec                  block_height
  jg .x_other_y_other_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET
%endmacro

; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical
; between the ssse3 and non-ssse3 version. It may make sense to merge their
; code in the sense that the ssse3 version would jump to the appropriate
; location in the sse/2 version, rather than duplicating that code in the
; binary.

; Instantiate every variant of the macro above.  The 4xh versions run on MMX
; registers (mmsize == 8, "sse" here means MMX + the SSE pavgb/pshufw
; extensions); the 8xh/16xh versions run on XMM registers.
INIT_MMX sse
SUBPEL_VARIANCE  4
INIT_XMM sse2
SUBPEL_VARIANCE  8
SUBPEL_VARIANCE 16

INIT_MMX ssse3
SUBPEL_VARIANCE  4
INIT_XMM ssse3
SUBPEL_VARIANCE  8
SUBPEL_VARIANCE 16

; "avg" variants: the filtered prediction is averaged with a second
; predictor (sec) before the variance is accumulated.
INIT_MMX sse
SUBPEL_VARIANCE  4, 1
INIT_XMM sse2
SUBPEL_VARIANCE  8, 1
SUBPEL_VARIANCE 16, 1

INIT_MMX ssse3
SUBPEL_VARIANCE  4, 1
INIT_XMM ssse3
SUBPEL_VARIANCE  8, 1
SUBPEL_VARIANCE 16, 1