;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA
pw_8: times 8 dw 8
bilin_filter_m_sse2: times 8 dw 16
                     times 8 dw 0
                     times 8 dw 14
                     times 8 dw 2
                     times 8 dw 12
                     times 8 dw 4
                     times 8 dw 10
                     times 8 dw 6
                     times 16 dw 8
                     times 8 dw 6
                     times 8 dw 10
                     times 8 dw 4
                     times 8 dw 12
                     times 8 dw 2
                     times 8 dw 14

bilin_filter_m_ssse3: times 8 db 16, 0
                      times 8 db 14, 2
                      times 8 db 12, 4
                      times 8 db 10, 6
                      times 16 db 8
                      times 8 db 6, 10
                      times 8 db 4, 12
                      times 8 db 2, 14

SECTION .text

; int vpx_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
;                               int x_offset, int y_offset,
;                               const uint8_t *ref, ptrdiff_t ref_stride,
;                               int height, unsigned int *sse);
;
; This function returns the signed sum of differences (SE) as its return
; value and stores the sum of squared differences (SSE) in the given pointer.

%macro SUM_SSE 6 ; src1, ref1, src2, ref2, sum, sse
  psubw                %3, %4
  psubw                %1, %2
  paddw                %5, %3
  pmaddwd              %3, %3
  paddw                %5, %1
  pmaddwd              %1, %1
  paddd                %6, %3
  paddd                %6, %1
%endmacro
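
; Editor's note (sketch, not in the original source): SUM_SSE consumes two
; row-pairs of word-sized pixels and accumulates, per lane,
;
;   sum += (src - ref)            ; word lanes, into m6
;   sse += (src - ref)^2         ; dword lanes via pmaddwd, into m7
;
; The asm returns the signed sum (SE) in rax and writes SSE through *sse;
; the C callers are expected to combine the two as, roughly,
; variance = sse - sum*sum / (w*h).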

%macro STORE_AND_RET 1
%if %1 > 4
  ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
  ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
  ; We have to sign-extend it before adding the words within the register
  ; and outputting to a dword.
  pcmpgtw              m5, m6           ; mask for 0 > x
  movhlps              m3, m7
  punpcklwd            m4, m6, m5
  punpckhwd            m6, m5           ; sign-extend m6 word->dword
  paddd                m7, m3
  paddd                m6, m4
  pshufd               m3, m7, 0x1
  movhlps              m4, m6
  paddd                m7, m3
  paddd                m6, m4
  mov                  r1, ssem         ; r1 = unsigned int *sse
  pshufd               m4, m6, 0x1
  movd                 [r1], m7         ; store sse
  paddd                m6, m4
  movd                 raxd, m6         ; store sum as return value
%else ; 4xh
  pshuflw              m4, m6, 0xe
  pshuflw              m3, m7, 0xe
  paddw                m6, m4
  paddd                m7, m3
  pcmpgtw              m5, m6           ; mask for 0 > x
  mov                  r1, ssem         ; r1 = unsigned int *sse
  punpcklwd            m6, m5           ; sign-extend m6 word->dword
  movd                 [r1], m7         ; store sse
  pshuflw              m4, m6, 0xe
  paddd                m6, m4
  movd                 raxd, m6         ; store sum as return value
%endif
  RET
%endmacro

%macro INC_SRC_BY_SRC_STRIDE 0
%if VPX_ARCH_X86=1 && CONFIG_PIC=1
  add                  srcq, src_stridemp
%else
  add                  srcq, src_strideq
%endif
%endmacro

%macro SUBPEL_VARIANCE 1-2 0 ; W
%if cpuflag(ssse3)
%define bilin_filter_m bilin_filter_m_ssse3
%define filter_idx_shift 4
%else
%define bilin_filter_m bilin_filter_m_sse2
%define filter_idx_shift 5
%endif
; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
; 11, not 13, if the registers are ordered correctly. May make a minor speed
; difference on Win64

%if VPX_ARCH_X86_64
  %if %2 == 1 ; avg
    cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
                                        x_offset, y_offset, ref, ref_stride, \
                                        second_pred, second_stride, height, sse
    %define second_str second_strideq
  %else
    cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \
                                    x_offset, y_offset, ref, ref_stride, \
                                    height, sse
  %endif
  %define block_height heightd
  %define bilin_filter sseq
%else
  %if CONFIG_PIC=1
    %if %2 == 1 ; avg
      cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
                                          x_offset, y_offset, ref, ref_stride, \
                                          second_pred, second_stride, height, sse
      %define block_height dword heightm
      %define second_str second_stridemp
    %else
      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
                                      x_offset, y_offset, ref, ref_stride, \
                                      height, sse
      %define block_height heightd
    %endif

    ; reuse argument stack space
    %define g_bilin_filterm x_offsetm
    %define g_pw_8m y_offsetm

    ; store the bilin_filter and pw_8 locations on the stack
    %if GET_GOT_DEFINED == 1
      GET_GOT eax
      add esp, 4                ; restore esp
    %endif

    lea ecx, [GLOBAL(bilin_filter_m)]
    mov g_bilin_filterm, ecx

    lea ecx, [GLOBAL(pw_8)]
    mov g_pw_8m, ecx

    LOAD_IF_USED 0, 1           ; load eax, ecx back
  %else
    %if %2 == 1 ; avg
      cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
                                          x_offset, y_offset, \
                                          ref, ref_stride, second_pred, second_stride, \
                                          height, sse
      %define block_height dword heightm
      %define second_str second_stridemp
    %else
      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
                                      x_offset, y_offset, ref, ref_stride, \
                                      height, sse
      %define block_height heightd
    %endif
    %define bilin_filter bilin_filter_m
  %endif
%endif

%if %1 == 4
  %define movx movd
%else
  %define movx movh
%endif

  ASSERT               %1 <= 16         ; m6 overflows if w > 16
  pxor                 m6, m6           ; sum
  pxor                 m7, m7           ; sse
  ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we
  ; could perhaps put it to more productive use there
  pxor                 m5, m5           ; dedicated zero register
%if %1 < 16
  sar                  block_height, 1
%if %2 == 1 ; avg
  shl                  second_str, 1
%endif
%endif

  ; FIXME(rbultje) replace by jumptable?
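  ; Editor's note (sketch, not in the original source): x_offset and y_offset
  ; are eighth-pel phases in [0, 7]. Per axis there are three cases: 0 (no
  ; filtering), 4 (half-pel, handled with pavgb), and any other value (true
  ; bilinear filtering; the phase is shifted left by filter_idx_shift to
  ; index the rodata tables at the top of the file). The tests below select
  ; among the resulting 3x3 = 9 code paths.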
  test                 x_offsetd, x_offsetd
  jnz .x_nonzero
  ; x_offset == 0
  test                 y_offsetd, y_offsetd
  jnz .x_zero_y_nonzero

  ; x_offset == 0 && y_offset == 0
.x_zero_y_zero_loop:
%if %1 == 16
  movu                 m0, [srcq]
  mova                 m1, [refq]
%if %2 == 1 ; avg
  pavgb                m0, [second_predq]
  punpckhbw            m3, m1, m5
  punpcklbw            m1, m5
%endif
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5

%if %2 == 0 ; !avg
  punpckhbw            m3, m1, m5
  punpcklbw            m1, m5
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7

  add                  srcq, src_strideq
  add                  refq, ref_strideq
%else ; %1 < 16
  movx                 m0, [srcq]
%if %2 == 1 ; avg
%if %1 > 4
  movhps               m0, [srcq+src_strideq]
%else ; 4xh
  movx                 m1, [srcq+src_strideq]
  punpckldq            m0, m1
%endif
%else ; !avg
  movx                 m2, [srcq+src_strideq]
%endif

  movx                 m1, [refq]
  movx                 m3, [refq+ref_strideq]

%if %2 == 1 ; avg
%if %1 > 4
  pavgb                m0, [second_predq]
%else
  movh                 m2, [second_predq]
  pavgb                m0, m2
%endif
  punpcklbw            m3, m5
  punpcklbw            m1, m5
%if %1 > 4
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else ; 4xh
  punpcklbw            m0, m5
  movhlps              m2, m0
%endif
%else ; !avg
  punpcklbw            m0, m5
  punpcklbw            m2, m5
  punpcklbw            m3, m5
  punpcklbw            m1, m5
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7

  lea                  srcq, [srcq+src_strideq*2]
  lea                  refq, [refq+ref_strideq*2]
%endif
%if %2 == 1 ; avg
  add                  second_predq, second_str
%endif
  dec                  block_height
  jg .x_zero_y_zero_loop
  STORE_AND_RET %1

.x_zero_y_nonzero:
  cmp                  y_offsetd, 4
  jne .x_zero_y_nonhalf

  ; x_offset == 0 && y_offset == 0.5
.x_zero_y_half_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m4, [srcq+src_strideq]
  mova                 m1, [refq]
  pavgb                m0, m4
  punpckhbw            m3, m1, m5
%if %2 == 1 ; avg
  pavgb                m0, [second_predq]
%endif
  punpcklbw            m1, m5
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7

  add                  srcq, src_strideq
  add                  refq, ref_strideq
%else ; %1 < 16
  movx                 m0, [srcq]
  movx                 m2, [srcq+src_strideq]
%if %2 == 1 ; avg
%if %1 > 4
  movhps               m2, [srcq+src_strideq*2]
%else ; 4xh
  movx                 m1, [srcq+src_strideq*2]
  punpckldq            m2, m1
%endif
  movx                 m1, [refq]
%if %1 > 4
  movlhps              m0, m2
%else ; 4xh
  punpckldq            m0, m2
%endif
  movx                 m3, [refq+ref_strideq]
  pavgb                m0, m2
  punpcklbw            m1, m5
%if %1 > 4
  pavgb                m0, [second_predq]
  punpcklbw            m3, m5
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else ; 4xh
  movh                 m4, [second_predq]
  pavgb                m0, m4
  punpcklbw            m3, m5
  punpcklbw            m0, m5
  movhlps              m2, m0
%endif
%else ; !avg
  movx                 m4, [srcq+src_strideq*2]
  movx                 m1, [refq]
  pavgb                m0, m2
  movx                 m3, [refq+ref_strideq]
  pavgb                m2, m4
  punpcklbw            m0, m5
  punpcklbw            m2, m5
  punpcklbw            m3, m5
  punpcklbw            m1, m5
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7

  lea                  srcq, [srcq+src_strideq*2]
  lea                  refq, [refq+ref_strideq*2]
%endif
%if %2 == 1 ; avg
  add                  second_predq, second_str
%endif
  dec                  block_height
  jg .x_zero_y_half_loop
  STORE_AND_RET %1
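
  ; Editor's note (sketch, not in the original source): for an eighth-pel
  ; phase k in [1, 7], the rodata tables hold the weight pair (16 - 2k, 2k),
  ; so each output pixel is out = (a * (16 - 2k) + b * 2k + 8) >> 4; e.g.
  ; k = 2 gives (12 * a + 4 * b + 8) >> 4. The ssse3 paths keep the weights
  ; interleaved as bytes and evaluate this with a single pmaddubsw per eight
  ; pixels, while the sse2 paths unpack to words and need two pmullw plus
  ; adds.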

.x_zero_y_nonhalf:
  ; x_offset == 0 && y_offset == bilin interpolation
%if VPX_ARCH_X86_64
  lea                  bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl                  y_offsetd, filter_idx_shift
%if VPX_ARCH_X86_64 && %1 > 4
  mova                 m8, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova                 m9, [bilin_filter+y_offsetq+16]
%endif
  mova                 m10, [GLOBAL(pw_8)]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else ; x86-32 or mmx
%if VPX_ARCH_X86=1 && CONFIG_PIC=1
; x_offset == 0, reuse x_offset reg
%define tempq x_offsetq
  add                  y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
  mov                  tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add                  y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

.x_zero_y_other_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m4, [srcq+src_strideq]
  mova                 m1, [refq]
%if cpuflag(ssse3)
  punpckhbw            m2, m0, m4
  punpcklbw            m0, m4
  pmaddubsw            m2, filter_y_a
  pmaddubsw            m0, filter_y_a
  paddw                m2, filter_rnd
  paddw                m0, filter_rnd
%else
  punpckhbw            m2, m0, m5
  punpckhbw            m3, m4, m5
  punpcklbw            m0, m5
  punpcklbw            m4, m5
  ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
  ; also do out=in1+((x*(in2-in1)+rnd)>>log2(num)). Total number of
  ; instructions is the same (5), but it is 1 mul instead of 2, so it might
  ; be slightly faster because of pmullw latency. It would also cut our
  ; rodata tables in half for this function, and save 1-2 registers on
  ; x86-64.
  pmullw               m2, filter_y_a
  pmullw               m3, filter_y_b
  paddw                m2, filter_rnd
  pmullw               m0, filter_y_a
  pmullw               m4, filter_y_b
  paddw                m0, filter_rnd
  paddw                m2, m3
  paddw                m0, m4
%endif
  psraw                m2, 4
  psraw                m0, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb             m0, m2
  pavgb                m0, [second_predq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%endif
  punpckhbw            m3, m1, m5
  punpcklbw            m1, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7

  add                  srcq, src_strideq
  add                  refq, ref_strideq
%else ; %1 < 16
  movx                 m0, [srcq]
  movx                 m2, [srcq+src_strideq]
  movx                 m4, [srcq+src_strideq*2]
  movx                 m3, [refq+ref_strideq]
%if cpuflag(ssse3)
  movx                 m1, [refq]
  punpcklbw            m0, m2
  punpcklbw            m2, m4
  pmaddubsw            m0, filter_y_a
  pmaddubsw            m2, filter_y_a
  punpcklbw            m3, m5
  paddw                m2, filter_rnd
  paddw                m0, filter_rnd
%else
  punpcklbw            m0, m5
  punpcklbw            m2, m5
  punpcklbw            m4, m5
  pmullw               m0, filter_y_a
  pmullw               m1, m2, filter_y_b
  punpcklbw            m3, m5
  paddw                m0, filter_rnd
  pmullw               m2, filter_y_a
  pmullw               m4, filter_y_b
  paddw                m0, m1
  paddw                m2, filter_rnd
  movx                 m1, [refq]
  paddw                m2, m4
%endif
  psraw                m0, 4
  psraw                m2, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
%if %1 == 4
  movlhps              m0, m2
%endif
  packuswb             m0, m2
%if %1 > 4
  pavgb                m0, [second_predq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else ; 4xh
  movh                 m2, [second_predq]
  pavgb                m0, m2
  punpcklbw            m0, m5
  movhlps              m2, m0
%endif
%endif
  punpcklbw            m1, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7

  lea                  srcq, [srcq+src_strideq*2]
  lea                  refq, [refq+ref_strideq*2]
%endif
%if %2 == 1 ; avg
  add                  second_predq, second_str
%endif
  dec                  block_height
  jg .x_zero_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET %1
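
  ; Editor's note (sketch, not in the original source): pavgb computes
  ; (a + b + 1) >> 1, which equals the k = 4 bilinear result
  ; (8*a + 8*b + 8) >> 4 exactly, so the half-pel paths below get the same
  ; rounding as the generic filter without any multiplies.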

.x_nonzero:
  cmp                  x_offsetd, 4
  jne .x_nonhalf
  ; x_offset == 0.5
  test                 y_offsetd, y_offsetd
  jnz .x_half_y_nonzero

  ; x_offset == 0.5 && y_offset == 0
.x_half_y_zero_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m4, [srcq+1]
  mova                 m1, [refq]
  pavgb                m0, m4
  punpckhbw            m3, m1, m5
%if %2 == 1 ; avg
  pavgb                m0, [second_predq]
%endif
  punpcklbw            m1, m5
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7

  add                  srcq, src_strideq
  add                  refq, ref_strideq
%else ; %1 < 16
  movx                 m0, [srcq]
  movx                 m4, [srcq+1]
%if %2 == 1 ; avg
%if %1 > 4
  movhps               m0, [srcq+src_strideq]
  movhps               m4, [srcq+src_strideq+1]
%else ; 4xh
  movx                 m1, [srcq+src_strideq]
  punpckldq            m0, m1
  movx                 m2, [srcq+src_strideq+1]
  punpckldq            m4, m2
%endif
  movx                 m1, [refq]
  movx                 m3, [refq+ref_strideq]
  pavgb                m0, m4
  punpcklbw            m3, m5
%if %1 > 4
  pavgb                m0, [second_predq]
  punpcklbw            m1, m5
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else ; 4xh
  movh                 m2, [second_predq]
  pavgb                m0, m2
  punpcklbw            m1, m5
  punpcklbw            m0, m5
  movhlps              m2, m0
%endif
%else ; !avg
  movx                 m2, [srcq+src_strideq]
  movx                 m1, [refq]
  pavgb                m0, m4
  movx                 m4, [srcq+src_strideq+1]
  movx                 m3, [refq+ref_strideq]
  pavgb                m2, m4
  punpcklbw            m0, m5
  punpcklbw            m2, m5
  punpcklbw            m3, m5
  punpcklbw            m1, m5
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7

  lea                  srcq, [srcq+src_strideq*2]
  lea                  refq, [refq+ref_strideq*2]
%endif
%if %2 == 1 ; avg
  add                  second_predq, second_str
%endif
  dec                  block_height
  jg .x_half_y_zero_loop
  STORE_AND_RET %1

.x_half_y_nonzero:
  cmp                  y_offsetd, 4
  jne .x_half_y_nonhalf

  ; x_offset == 0.5 && y_offset == 0.5
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m3, [srcq+1]
  add                  srcq, src_strideq
  pavgb                m0, m3
.x_half_y_half_loop:
  movu                 m4, [srcq]
  movu                 m3, [srcq+1]
  mova                 m1, [refq]
  pavgb                m4, m3
  punpckhbw            m3, m1, m5
  pavgb                m0, m4
%if %2 == 1 ; avg
  punpcklbw            m1, m5
  pavgb                m0, [second_predq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
  punpcklbw            m1, m5
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4

  add                  srcq, src_strideq
  add                  refq, ref_strideq
%else ; %1 < 16
  movx                 m0, [srcq]
  movx                 m3, [srcq+1]
  add                  srcq, src_strideq
  pavgb                m0, m3
.x_half_y_half_loop:
  movx                 m2, [srcq]
  movx                 m3, [srcq+1]
%if %2 == 1 ; avg
%if %1 > 4
  movhps               m2, [srcq+src_strideq]
  movhps               m3, [srcq+src_strideq+1]
%else
  movx                 m1, [srcq+src_strideq]
  punpckldq            m2, m1
  movx                 m1, [srcq+src_strideq+1]
  punpckldq            m3, m1
%endif
  pavgb                m2, m3
%if %1 > 4
  movlhps              m0, m2
  movhlps              m4, m2
%else ; 4xh
  punpckldq            m0, m2
  pshuflw              m4, m2, 0xe
%endif
  movx                 m1, [refq]
  pavgb                m0, m2
  movx                 m3, [refq+ref_strideq]
%if %1 > 4
  pavgb                m0, [second_predq]
%else
  movh                 m2, [second_predq]
  pavgb                m0, m2
%endif
  punpcklbw            m3, m5
  punpcklbw            m1, m5
%if %1 > 4
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else
  punpcklbw            m0, m5
  movhlps              m2, m0
%endif
%else ; !avg
  movx                 m4, [srcq+src_strideq]
  movx                 m1, [srcq+src_strideq+1]
  pavgb                m2, m3
  pavgb                m4, m1
  pavgb                m0, m2
  pavgb                m2, m4
  movx                 m1, [refq]
  movx                 m3, [refq+ref_strideq]
  punpcklbw            m0, m5
  punpcklbw            m2, m5
  punpcklbw            m3, m5
  punpcklbw            m1, m5
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4

  lea                  srcq, [srcq+src_strideq*2]
  lea                  refq, [refq+ref_strideq*2]
%endif
%if %2 == 1 ; avg
  add                  second_predq, second_str
%endif
  dec                  block_height
  jg .x_half_y_half_loop
  STORE_AND_RET %1
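
  ; Editor's note (sketch, not in the original source): the vertical loops in
  ; this and the following sections carry the previous row's filtered pixels
  ; across iterations (note the 'mova m0, m4' at each loop tail), so each
  ; iteration only has to load and filter one new source row.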

.x_half_y_nonhalf:
  ; x_offset == 0.5 && y_offset == bilin interpolation
%if VPX_ARCH_X86_64
  lea                  bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl                  y_offsetd, filter_idx_shift
%if VPX_ARCH_X86_64 && %1 > 4
  mova                 m8, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova                 m9, [bilin_filter+y_offsetq+16]
%endif
  mova                 m10, [GLOBAL(pw_8)]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else ; x86-32
%if VPX_ARCH_X86=1 && CONFIG_PIC=1
; x_offset == 0.5. We can reuse x_offset reg
%define tempq x_offsetq
  add                  y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
  mov                  tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add                  y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

%if %1 == 16
  movu                 m0, [srcq]
  movu                 m3, [srcq+1]
  add                  srcq, src_strideq
  pavgb                m0, m3
.x_half_y_other_loop:
  movu                 m4, [srcq]
  movu                 m2, [srcq+1]
  mova                 m1, [refq]
  pavgb                m4, m2
%if cpuflag(ssse3)
  punpckhbw            m2, m0, m4
  punpcklbw            m0, m4
  pmaddubsw            m2, filter_y_a
  pmaddubsw            m0, filter_y_a
  paddw                m2, filter_rnd
  paddw                m0, filter_rnd
  psraw                m2, 4
%else
  punpckhbw            m2, m0, m5
  punpckhbw            m3, m4, m5
  pmullw               m2, filter_y_a
  pmullw               m3, filter_y_b
  paddw                m2, filter_rnd
  punpcklbw            m0, m5
  paddw                m2, m3
  punpcklbw            m3, m4, m5
  pmullw               m0, filter_y_a
  pmullw               m3, filter_y_b
  paddw                m0, filter_rnd
  psraw                m2, 4
  paddw                m0, m3
%endif
  punpckhbw            m3, m1, m5
  psraw                m0, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb             m0, m2
  pavgb                m0, [second_predq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%endif
  punpcklbw            m1, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4

  add                  srcq, src_strideq
  add                  refq, ref_strideq
%else ; %1 < 16
  movx                 m0, [srcq]
  movx                 m3, [srcq+1]
  add                  srcq, src_strideq
  pavgb                m0, m3
%if notcpuflag(ssse3)
  punpcklbw            m0, m5
%endif
.x_half_y_other_loop:
  movx                 m2, [srcq]
  movx                 m1, [srcq+1]
  movx                 m4, [srcq+src_strideq]
  movx                 m3, [srcq+src_strideq+1]
  pavgb                m2, m1
  pavgb                m4, m3
  movx                 m3, [refq+ref_strideq]
%if cpuflag(ssse3)
  movx                 m1, [refq]
  punpcklbw            m0, m2
  punpcklbw            m2, m4
  pmaddubsw            m0, filter_y_a
  pmaddubsw            m2, filter_y_a
  punpcklbw            m3, m5
  paddw                m0, filter_rnd
  paddw                m2, filter_rnd
%else
  punpcklbw            m2, m5
  punpcklbw            m4, m5
  pmullw               m0, filter_y_a
  pmullw               m1, m2, filter_y_b
  punpcklbw            m3, m5
  paddw                m0, filter_rnd
  pmullw               m2, filter_y_a
  paddw                m0, m1
  pmullw               m1, m4, filter_y_b
  paddw                m2, filter_rnd
  paddw                m2, m1
  movx                 m1, [refq]
%endif
  psraw                m0, 4
  psraw                m2, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
%if %1 == 4
  movlhps              m0, m2
%endif
  packuswb             m0, m2
%if %1 > 4
  pavgb                m0, [second_predq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else
  movh                 m2, [second_predq]
  pavgb                m0, m2
  punpcklbw            m0, m5
  movhlps              m2, m0
%endif
%endif
  punpcklbw            m1, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4

  lea                  srcq, [srcq+src_strideq*2]
  lea                  refq, [refq+ref_strideq*2]
%endif
%if %2 == 1 ; avg
  add                  second_predq, second_str
%endif
  dec                  block_height
  jg .x_half_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET %1

.x_nonhalf:
  test                 y_offsetd, y_offsetd
  jnz .x_nonhalf_y_nonzero

  ; x_offset == bilin interpolation && y_offset == 0
%if VPX_ARCH_X86_64
  lea                  bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl                  x_offsetd, filter_idx_shift
%if VPX_ARCH_X86_64 && %1 > 4
  mova                 m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova                 m9, [bilin_filter+x_offsetq+16]
%endif
  mova                 m10, [GLOBAL(pw_8)]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else ; x86-32
%if VPX_ARCH_X86=1 && CONFIG_PIC=1
; y_offset == 0. We can reuse y_offset reg.
%define tempq y_offsetq
  add                  x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
  mov                  tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add                  x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

.x_other_y_zero_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m4, [srcq+1]
  mova                 m1, [refq]
%if cpuflag(ssse3)
  punpckhbw            m2, m0, m4
  punpcklbw            m0, m4
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m0, filter_x_a
  paddw                m2, filter_rnd
  paddw                m0, filter_rnd
%else
  punpckhbw            m2, m0, m5
  punpckhbw            m3, m4, m5
  punpcklbw            m0, m5
  punpcklbw            m4, m5
  pmullw               m2, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m2, filter_rnd
  pmullw               m0, filter_x_a
  pmullw               m4, filter_x_b
  paddw                m0, filter_rnd
  paddw                m2, m3
  paddw                m0, m4
%endif
  psraw                m2, 4
  psraw                m0, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb             m0, m2
  pavgb                m0, [second_predq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%endif
  punpckhbw            m3, m1, m5
  punpcklbw            m1, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7

  add                  srcq, src_strideq
  add                  refq, ref_strideq
%else ; %1 < 16
  movx                 m0, [srcq]
  movx                 m1, [srcq+1]
  movx                 m2, [srcq+src_strideq]
  movx                 m4, [srcq+src_strideq+1]
  movx                 m3, [refq+ref_strideq]
%if cpuflag(ssse3)
  punpcklbw            m0, m1
  movx                 m1, [refq]
  punpcklbw            m2, m4
  pmaddubsw            m0, filter_x_a
  pmaddubsw            m2, filter_x_a
  punpcklbw            m3, m5
  paddw                m0, filter_rnd
  paddw                m2, filter_rnd
%else
  punpcklbw            m0, m5
  punpcklbw            m1, m5
  punpcklbw            m2, m5
  punpcklbw            m4, m5
  pmullw               m0, filter_x_a
  pmullw               m1, filter_x_b
  punpcklbw            m3, m5
  paddw                m0, filter_rnd
  pmullw               m2, filter_x_a
  pmullw               m4, filter_x_b
  paddw                m0, m1
  paddw                m2, filter_rnd
  movx                 m1, [refq]
  paddw                m2, m4
%endif
  psraw                m0, 4
  psraw                m2, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
%if %1 == 4
  movlhps              m0, m2
%endif
  packuswb             m0, m2
%if %1 > 4
  pavgb                m0, [second_predq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else
  movh                 m2, [second_predq]
  pavgb                m0, m2
  punpcklbw            m0, m5
  movhlps              m2, m0
%endif
%endif
  punpcklbw            m1, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7

  lea                  srcq, [srcq+src_strideq*2]
  lea                  refq, [refq+ref_strideq*2]
%endif
%if %2 == 1 ; avg
  add                  second_predq, second_str
%endif
  dec                  block_height
  jg .x_other_y_zero_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
  STORE_AND_RET %1

.x_nonhalf_y_nonzero:
  cmp                  y_offsetd, 4
  jne .x_nonhalf_y_nonhalf

  ; x_offset == bilin interpolation && y_offset == 0.5
%if VPX_ARCH_X86_64
  lea                  bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl                  x_offsetd, filter_idx_shift
%if VPX_ARCH_X86_64 && %1 > 4
  mova                 m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova                 m9, [bilin_filter+x_offsetq+16]
%endif
  mova                 m10, [GLOBAL(pw_8)]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else ; x86-32
%if VPX_ARCH_X86=1 && CONFIG_PIC=1
; y_offset == 0.5. We can reuse y_offset reg.
%define tempq y_offsetq
  add                  x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
  mov                  tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add                  x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

%if %1 == 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+1]
%if cpuflag(ssse3)
  punpckhbw            m2, m0, m1
  punpcklbw            m0, m1
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m0, filter_x_a
  paddw                m2, filter_rnd
  paddw                m0, filter_rnd
%else
  punpckhbw            m2, m0, m5
  punpckhbw            m3, m1, m5
  punpcklbw            m0, m5
  punpcklbw            m1, m5
  pmullw               m0, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m0, filter_rnd
  pmullw               m2, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m2, filter_rnd
  paddw                m0, m1
  paddw                m2, m3
%endif
  psraw                m0, 4
  psraw                m2, 4
  add                  srcq, src_strideq
  packuswb             m0, m2
.x_other_y_half_loop:
  movu                 m4, [srcq]
  movu                 m3, [srcq+1]
%if cpuflag(ssse3)
  mova                 m1, [refq]
  punpckhbw            m2, m4, m3
  punpcklbw            m4, m3
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m4, filter_x_a
  paddw                m2, filter_rnd
  paddw                m4, filter_rnd
  psraw                m2, 4
  psraw                m4, 4
  packuswb             m4, m2
  pavgb                m0, m4
  punpckhbw            m3, m1, m5
  punpcklbw            m1, m5
%else
  punpckhbw            m2, m4, m5
  punpckhbw            m1, m3, m5
  punpcklbw            m4, m5
  punpcklbw            m3, m5
  pmullw               m4, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m4, filter_rnd
  pmullw               m2, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m2, filter_rnd
  paddw                m4, m3
  paddw                m2, m1
  mova                 m1, [refq]
  psraw                m4, 4
  psraw                m2, 4
  punpckhbw            m3, m1, m5
  ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we
  ; have a 1-register shortage to be able to store the backup of the bilin
  ; filtered second line as words, as a cache for the next line. Packing into
  ; bytes costs 1 pack and 2 unpacks, but saves a register.
  packuswb             m4, m2
  punpcklbw            m1, m5
  pavgb                m0, m4
%endif
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  pavgb                m0, [second_predq]
%endif
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4

  add                  srcq, src_strideq
  add                  refq, ref_strideq
%else ; %1 < 16
  movx                 m0, [srcq]
  movx                 m1, [srcq+1]
%if cpuflag(ssse3)
  punpcklbw            m0, m1
  pmaddubsw            m0, filter_x_a
  paddw                m0, filter_rnd
%else
  punpcklbw            m0, m5
  punpcklbw            m1, m5
  pmullw               m0, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m0, filter_rnd
  paddw                m0, m1
%endif
  add                  srcq, src_strideq
  psraw                m0, 4
.x_other_y_half_loop:
  movx                 m2, [srcq]
  movx                 m1, [srcq+1]
  movx                 m4, [srcq+src_strideq]
  movx                 m3, [srcq+src_strideq+1]
%if cpuflag(ssse3)
  punpcklbw            m2, m1
  punpcklbw            m4, m3
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m4, filter_x_a
  movx                 m1, [refq]
  movx                 m3, [refq+ref_strideq]
  paddw                m2, filter_rnd
  paddw                m4, filter_rnd
%else
  punpcklbw            m2, m5
  punpcklbw            m1, m5
  punpcklbw            m4, m5
  punpcklbw            m3, m5
  pmullw               m2, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m2, filter_rnd
  pmullw               m4, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m4, filter_rnd
  paddw                m2, m1
  movx                 m1, [refq]
  paddw                m4, m3
  movx                 m3, [refq+ref_strideq]
%endif
  psraw                m2, 4
  psraw                m4, 4
  pavgw                m0, m2
  pavgw                m2, m4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline - also consider going to bytes here
%if %1 == 4
  movlhps              m0, m2
%endif
  packuswb             m0, m2
%if %1 > 4
  pavgb                m0, [second_predq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else
  movh                 m2, [second_predq]
  pavgb                m0, m2
  punpcklbw            m0, m5
  movhlps              m2, m0
%endif
%endif
  punpcklbw            m3, m5
  punpcklbw            m1, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4

  lea                  srcq, [srcq+src_strideq*2]
  lea                  refq, [refq+ref_strideq*2]
%endif
%if %2 == 1 ; avg
  add                  second_predq, second_str
%endif
  dec                  block_height
  jg .x_other_y_half_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
  STORE_AND_RET %1
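
  ; Editor's note (sketch, not in the original source): the fully generic
  ; case below filters each row horizontally, packs the result back to bytes
  ; on ssse3, and then blends consecutive filtered rows with the vertical
  ; filter. On x86-32 PIC builds no free register remains for the table
  ; pointers, so src_strideq is repurposed and the stride is reloaded from
  ; the stack (src_stridemp) by INC_SRC_BY_SRC_STRIDE.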

.x_nonhalf_y_nonhalf:
%if VPX_ARCH_X86_64
  lea                  bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl                  x_offsetd, filter_idx_shift
  shl                  y_offsetd, filter_idx_shift
%if VPX_ARCH_X86_64 && %1 > 4
  mova                 m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova                 m9, [bilin_filter+x_offsetq+16]
%endif
  mova                 m10, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova                 m11, [bilin_filter+y_offsetq+16]
%endif
  mova                 m12, [GLOBAL(pw_8)]
%define filter_x_a m8
%define filter_x_b m9
%define filter_y_a m10
%define filter_y_b m11
%define filter_rnd m12
%else ; x86-32
%if VPX_ARCH_X86=1 && CONFIG_PIC=1
; In this case, there is NO unused register. We reuse the src_stride
; register; src_stride then has to be reloaded from the stack whenever it is
; needed.
%define tempq src_strideq
  mov                  tempq, g_bilin_filterm
  add                  x_offsetq, tempq
  add                  y_offsetq, tempq
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]

  mov                  tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add                  x_offsetq, bilin_filter
  add                  y_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

  ; x_offset == bilin interpolation && y_offset == bilin interpolation
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+1]
%if cpuflag(ssse3)
  punpckhbw            m2, m0, m1
  punpcklbw            m0, m1
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m0, filter_x_a
  paddw                m2, filter_rnd
  paddw                m0, filter_rnd
%else
  punpckhbw            m2, m0, m5
  punpckhbw            m3, m1, m5
  punpcklbw            m0, m5
  punpcklbw            m1, m5
  pmullw               m0, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m0, filter_rnd
  pmullw               m2, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m2, filter_rnd
  paddw                m0, m1
  paddw                m2, m3
%endif
  psraw                m0, 4
  psraw                m2, 4

  INC_SRC_BY_SRC_STRIDE

  packuswb             m0, m2
.x_other_y_other_loop:
%if cpuflag(ssse3)
  movu                 m4, [srcq]
  movu                 m3, [srcq+1]
  mova                 m1, [refq]
  punpckhbw            m2, m4, m3
  punpcklbw            m4, m3
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m4, filter_x_a
  punpckhbw            m3, m1, m5
  paddw                m2, filter_rnd
  paddw                m4, filter_rnd
  psraw                m2, 4
  psraw                m4, 4
  packuswb             m4, m2
  punpckhbw            m2, m0, m4
  punpcklbw            m0, m4
  pmaddubsw            m2, filter_y_a
  pmaddubsw            m0, filter_y_a
  punpcklbw            m1, m5
  paddw                m2, filter_rnd
  paddw                m0, filter_rnd
  psraw                m2, 4
  psraw                m0, 4
%else
  movu                 m3, [srcq]
  movu                 m4, [srcq+1]
  punpckhbw            m1, m3, m5
  punpckhbw            m2, m4, m5
  punpcklbw            m3, m5
  punpcklbw            m4, m5
  pmullw               m3, filter_x_a
  pmullw               m4, filter_x_b
  paddw                m3, filter_rnd
  pmullw               m1, filter_x_a
  pmullw               m2, filter_x_b
  paddw                m1, filter_rnd
  paddw                m3, m4
  paddw                m1, m2
  psraw                m3, 4
  psraw                m1, 4
  packuswb             m4, m3, m1
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
  pmullw               m2, filter_y_a
  pmullw               m1, filter_y_b
  paddw                m2, filter_rnd
  pmullw               m0, filter_y_a
  pmullw               m3, filter_y_b
  paddw                m2, m1
  mova                 m1, [refq]
  paddw                m0, filter_rnd
  psraw                m2, 4
  paddw                m0, m3
  punpckhbw            m3, m1, m5
  psraw                m0, 4
  punpcklbw            m1, m5
%endif
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb             m0, m2
  pavgb                m0, [second_predq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4

  INC_SRC_BY_SRC_STRIDE
  add                  refq, ref_strideq
%else ; %1 < 16
  movx                 m0, [srcq]
  movx                 m1, [srcq+1]
%if cpuflag(ssse3)
  punpcklbw            m0, m1
  pmaddubsw            m0, filter_x_a
  paddw                m0, filter_rnd
%else
  punpcklbw            m0, m5
  punpcklbw            m1, m5
  pmullw               m0, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m0, filter_rnd
  paddw                m0, m1
%endif
  psraw                m0, 4
%if cpuflag(ssse3)
  packuswb             m0, m0
%endif

  INC_SRC_BY_SRC_STRIDE

.x_other_y_other_loop:
  movx                 m2, [srcq]
  movx                 m1, [srcq+1]

  INC_SRC_BY_SRC_STRIDE
  movx                 m4, [srcq]
  movx                 m3, [srcq+1]

%if cpuflag(ssse3)
  punpcklbw            m2, m1
  punpcklbw            m4, m3
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m4, filter_x_a
  movx                 m3, [refq+ref_strideq]
  movx                 m1, [refq]
  paddw                m2, filter_rnd
  paddw                m4, filter_rnd
  psraw                m2, 4
  psraw                m4, 4
  packuswb             m2, m2
  packuswb             m4, m4
  punpcklbw            m0, m2
  punpcklbw            m2, m4
  pmaddubsw            m0, filter_y_a
  pmaddubsw            m2, filter_y_a
  punpcklbw            m3, m5
  paddw                m0, filter_rnd
  paddw                m2, filter_rnd
  psraw                m0, 4
  psraw                m2, 4
  punpcklbw            m1, m5
%else
  punpcklbw            m2, m5
  punpcklbw            m1, m5
  punpcklbw            m4, m5
  punpcklbw            m3, m5
  pmullw               m2, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m2, filter_rnd
  pmullw               m4, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m4, filter_rnd
  paddw                m2, m1
  paddw                m4, m3
  psraw                m2, 4
  psraw                m4, 4
  pmullw               m0, filter_y_a
  pmullw               m3, m2, filter_y_b
  paddw                m0, filter_rnd
  pmullw               m2, filter_y_a
  pmullw               m1, m4, filter_y_b
  paddw                m2, filter_rnd
  paddw                m0, m3
  movx                 m3, [refq+ref_strideq]
  paddw                m2, m1
  movx                 m1, [refq]
  psraw                m0, 4
  psraw                m2, 4
  punpcklbw            m3, m5
  punpcklbw            m1, m5
%endif
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
%if %1 == 4
  movlhps              m0, m2
%endif
  packuswb             m0, m2
%if %1 > 4
  pavgb                m0, [second_predq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else
  movh                 m2, [second_predq]
  pavgb                m0, m2
  punpcklbw            m0, m5
  movhlps              m2, m0
%endif
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4

  INC_SRC_BY_SRC_STRIDE
  lea                  refq, [refq+ref_strideq*2]
%endif
%if %2 == 1 ; avg
  add                  second_predq, second_str
%endif
  dec                  block_height
  jg .x_other_y_other_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
%undef movx
  STORE_AND_RET %1
%endmacro

; FIXME(rbultje) the non-bilinear versions (i.e. x=0,4 && y=0,4) are
; identical between the ssse3 and non-ssse3 versions. It may make sense to
; merge their code in the sense that the ssse3 version would jump to the
; appropriate location in the sse2 version, rather than duplicating that
; code in the binary.

INIT_XMM sse2
SUBPEL_VARIANCE 4
SUBPEL_VARIANCE 8
SUBPEL_VARIANCE 16

INIT_XMM ssse3
SUBPEL_VARIANCE 4
SUBPEL_VARIANCE 8
SUBPEL_VARIANCE 16

INIT_XMM sse2
SUBPEL_VARIANCE 4, 1
SUBPEL_VARIANCE 8, 1
SUBPEL_VARIANCE 16, 1

INIT_XMM ssse3
SUBPEL_VARIANCE 4, 1
SUBPEL_VARIANCE 8, 1
SUBPEL_VARIANCE 16, 1
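
; Editor's note (sketch, not in the original source): with x86inc's name
; decoration, each SUBPEL_VARIANCE W[, avg] expansion above should emit a
; symbol of the form vpx_sub_pixel_variance<W>xh_<isa> (or the _avg_
; variant), where <isa> comes from the preceding INIT_XMM. The per-block-size
; C wrappers elsewhere in libvpx are expected to call these with the
; appropriate height and combine the returned sum and stored SSE into the
; final variance.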