;
;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA
pw_8: times  8 dw  8
bilin_filter_m_sse2: times  8 dw 16
                     times  8 dw  0
                     times  8 dw 14
                     times  8 dw  2
                     times  8 dw 12
                     times  8 dw  4
                     times  8 dw 10
                     times  8 dw  6
                     times 16 dw  8
                     times  8 dw  6
                     times  8 dw 10
                     times  8 dw  4
                     times  8 dw 12
                     times  8 dw  2
                     times  8 dw 14

SECTION .text

; int vpx_highbd_sub_pixel_varianceNxh(const uint16_t *src,
;                                      ptrdiff_t src_stride,
;                                      int x_offset, int y_offset,
;                                      const uint16_t *ref,
;                                      ptrdiff_t ref_stride,
;                                      int height, unsigned int *sse);
;
; This function returns the SE (the sum of the differences) as its return
; value and stores the SSE (the sum of squared differences) in the given
; pointer.
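;
; As an illustrative aid only (not part of the original source), the scalar C
; sketch below models what the macros in this file compute for the general
; bilinear case: the 16-bit source is filtered horizontally and then
; vertically with 4-bit taps loaded from bilin_filter_m (the two taps of each
; pair sum to 16, and pw_8 is the rounding term for the >> 4), and the sum
; and sum of squares of the differences against the reference are
; accumulated.  The zero-offset and half-pel paths below are specializations
; of this, and the avg variants additionally pavgw the prediction with
; second_pred before accumulating.  All names in the sketch are hypothetical,
; and the accumulators are widened here for clarity.
;
;   #include <stdint.h>
;   // fxa + fxb == 16 and fya + fyb == 16 (filter_x_a/b, filter_y_a/b below).
;   static int64_t subpel_var_model(const uint16_t *src, int src_stride,
;                                   const uint16_t *ref, int ref_stride,
;                                   int fxa, int fxb, int fya, int fyb,
;                                   int w, int h, uint32_t *sse) {
;     int64_t sum = 0, sq = 0;
;     for (int i = 0; i < h; ++i) {
;       for (int j = 0; j < w; ++j) {
;         const int t0 = (fxa * src[i * src_stride + j] +
;                         fxb * src[i * src_stride + j + 1] + 8) >> 4;
;         const int t1 = (fxa * src[(i + 1) * src_stride + j] +
;                         fxb * src[(i + 1) * src_stride + j + 1] + 8) >> 4;
;         const int pred = (fya * t0 + fyb * t1 + 8) >> 4;
;         const int diff = pred - ref[i * ref_stride + j];
;         sum += diff;
;         sq += (int64_t)diff * diff;
;       }
;     }
;     *sse = (uint32_t)sq;
;     return sum;  // SE; the SSE is stored through the pointer.
;   }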
%macro SUM_SSE 6 ; src1, ref1, src2, ref2, sum, sse
  psubw                %3, %4
  psubw                %1, %2
  mova                 %4, %3                 ; make copies to manipulate to calc sum
  mova                 %2, %1                 ; use originals for calc sse
  pmaddwd              %3, %3
  paddw                %4, %2
  pmaddwd              %1, %1
  movhlps              %2, %4
  paddd                %6, %3
  paddw                %4, %2
  pxor                 %2, %2
  pcmpgtw              %2, %4                 ; mask for 0 > %4 (sum)
  punpcklwd            %4, %2                 ; sign-extend word to dword
  paddd                %6, %1
  paddd                %5, %4
%endmacro

%macro STORE_AND_RET 0
%if mmsize == 16
  ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
  ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
  ; We have to sign-extend it before adding the words within the register
  ; and outputting to a dword.
  movhlps              m3, m7
  movhlps              m4, m6
  paddd                m7, m3
  paddd                m6, m4
  pshufd               m3, m7, 0x1
  pshufd               m4, m6, 0x1
  paddd                m7, m3
  paddd                m6, m4
  mov                  r1, ssem               ; r1 = unsigned int *sse
  movd                 [r1], m7               ; store sse
  movd                 eax, m6                ; store sum as return value
%endif
  RET
%endmacro

%macro INC_SRC_BY_SRC_STRIDE 0
%if VPX_ARCH_X86=1 && CONFIG_PIC=1
  add                  srcq, src_stridemp
  add                  srcq, src_stridemp
%else
  lea                  srcq, [srcq + src_strideq*2]
%endif
%endmacro

%macro SUBPEL_VARIANCE 1-2 0 ; W
%define bilin_filter_m bilin_filter_m_sse2
%define filter_idx_shift 5

%if VPX_ARCH_X86_64
  %if %2 == 1 ; avg
    cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
                                               x_offset, y_offset, \
                                               ref, ref_stride, \
                                               second_pred, second_stride, height, sse
    %define second_str second_strideq
  %else
    cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \
                                           x_offset, y_offset, \
                                           ref, ref_stride, height, sse
  %endif
  %define block_height heightd
  %define bilin_filter sseq
%else
  %if CONFIG_PIC=1
    %if %2 == 1 ; avg
      cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
                                                 x_offset, y_offset, \
                                                 ref, ref_stride, \
                                                 second_pred, second_stride, height, sse
      %define block_height dword heightm
      %define second_str second_stridemp
    %else
      cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
                                             x_offset, y_offset, \
                                             ref, ref_stride, height, sse
      %define block_height heightd
    %endif

    ; reuse argument stack space
    %define g_bilin_filterm x_offsetm
    %define g_pw_8m y_offsetm

    ; Store the bilin_filter and pw_8 locations on the stack.
    %if GET_GOT_DEFINED == 1
      GET_GOT eax
      add esp, 4                               ; restore esp
    %endif

    lea ecx, [GLOBAL(bilin_filter_m)]
    mov g_bilin_filterm, ecx

    lea ecx, [GLOBAL(pw_8)]
    mov g_pw_8m, ecx

    LOAD_IF_USED 0, 1                          ; load eax, ecx back
  %else
    %if %2 == 1 ; avg
      cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
                                                 x_offset, y_offset, \
                                                 ref, ref_stride, \
                                                 second_pred, second_stride, height, sse
      %define block_height dword heightm
      %define second_str second_stridemp
    %else
      cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
                                             x_offset, y_offset, \
                                             ref, ref_stride, height, sse
      %define block_height heightd
    %endif

    %define bilin_filter bilin_filter_m
  %endif
%endif

  ASSERT               %1 <= 16               ; m6 overflows if w > 16
  pxor                 m6, m6                 ; sum
  pxor                 m7, m7                 ; sse

%if %1 < 16
  sar                  block_height, 1
%endif
%if %2 == 1 ; avg
  shl                  second_str, 1
%endif

  ; FIXME(rbultje) replace by jumptable?
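  ; The chain of test/cmp branches below (rather than the jumptable suggested
  ; above) dispatches on (x_offset, y_offset).  Each offset is handled in one
  ; of three ways -- 0 (no filtering), 8 (half-pel, done with pavgw) or any
  ; other value (full bilinear filter via bilin_filter_m) -- which yields the
  ; nine .x_*_y_* code paths that follow.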
  test                 x_offsetd, x_offsetd
  jnz .x_nonzero
  ; x_offset == 0
  test                 y_offsetd, y_offsetd
  jnz .x_zero_y_nonzero

  ; x_offset == 0 && y_offset == 0
.x_zero_y_zero_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m2, [srcq + 16]
  mova                 m1, [refq]
  mova                 m3, [refq + 16]
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  pavgw                m2, [second_predq+16]
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7

  lea                  srcq, [srcq + src_strideq*2]
  lea                  refq, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  add                  second_predq, second_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m2, [srcq + src_strideq*2]
  mova                 m1, [refq]
  mova                 m3, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  add                  second_predq, second_str
  pavgw                m2, [second_predq]
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7

  lea                  srcq, [srcq + src_strideq*4]
  lea                  refq, [refq + ref_strideq*4]
%if %2 == 1 ; avg
  add                  second_predq, second_str
%endif
%endif
  dec                  block_height
  jg .x_zero_y_zero_loop
  STORE_AND_RET

.x_zero_y_nonzero:
  cmp                  y_offsetd, 8
  jne .x_zero_y_nonhalf

  ; x_offset == 0 && y_offset == 0.5
.x_zero_y_half_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+16]
  movu                 m4, [srcq+src_strideq*2]
  movu                 m5, [srcq+src_strideq*2+16]
  mova                 m2, [refq]
  mova                 m3, [refq+16]
  pavgw                m0, m4
  pavgw                m1, m5
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  pavgw                m1, [second_predq+16]
%endif
  SUM_SSE              m0, m2, m1, m3, m6, m7

  lea                  srcq, [srcq + src_strideq*2]
  lea                  refq, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  add                  second_predq, second_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+src_strideq*2]
  movu                 m5, [srcq+src_strideq*4]
  mova                 m2, [refq]
  mova                 m3, [refq+ref_strideq*2]
  pavgw                m0, m1
  pavgw                m1, m5
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  add                  second_predq, second_str
  pavgw                m1, [second_predq]
%endif
  SUM_SSE              m0, m2, m1, m3, m6, m7

  lea                  srcq, [srcq + src_strideq*4]
  lea                  refq, [refq + ref_strideq*4]
%if %2 == 1 ; avg
  add                  second_predq, second_str
%endif
%endif
  dec                  block_height
  jg .x_zero_y_half_loop
  STORE_AND_RET

.x_zero_y_nonhalf:
  ; x_offset == 0 && y_offset == bilin interpolation
%if VPX_ARCH_X86_64
  lea                  bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl                  y_offsetd, filter_idx_shift
%if VPX_ARCH_X86_64 && mmsize == 16
  mova                 m8, [bilin_filter+y_offsetq]
  mova                 m9, [bilin_filter+y_offsetq+16]
  mova                 m10, [GLOBAL(pw_8)]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else ; x86-32 or mmx
%if VPX_ARCH_X86=1 && CONFIG_PIC=1
; x_offset == 0, reuse x_offset reg
%define tempq x_offsetq
  add                  y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
  mov                  tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add                  y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

.x_zero_y_other_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m1, [srcq + 16]
  movu                 m4, [srcq+src_strideq*2]
  movu                 m5, [srcq+src_strideq*2+16]
  mova                 m2, [refq]
  mova                 m3, [refq+16]
  ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
  ; also do out=in1+((x*(in2-in1)+rnd)>>log2(num)). The total number of
  ; instructions is the same (5), but it is 1 mul instead of 2, so it might be
  ; slightly faster because of pmullw latency. It would also cut our rodata
  ; tables in half for this function, and save 1-2 registers on x86-64.
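  ; As a purely illustrative check of that identity (numbers not from the
  ; original source): with num=16, x=5, in1=100 and in2=200,
  ;   ((16-5)*100 + 5*200 + 8) >> 4 = 2108 >> 4 = 131, and
  ;   100 + ((5*(200-100) + 8) >> 4) = 100 + (508 >> 4) = 100 + 31 = 131.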
  pmullw               m1, filter_y_a
  pmullw               m5, filter_y_b
  paddw                m1, filter_rnd
  pmullw               m0, filter_y_a
  pmullw               m4, filter_y_b
  paddw                m0, filter_rnd
  paddw                m1, m5
  paddw                m0, m4
  psrlw                m1, 4
  psrlw                m0, 4
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  pavgw                m1, [second_predq+16]
%endif
  SUM_SSE              m0, m2, m1, m3, m6, m7

  lea                  srcq, [srcq + src_strideq*2]
  lea                  refq, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  add                  second_predq, second_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+src_strideq*2]
  movu                 m5, [srcq+src_strideq*4]
  mova                 m4, m1
  mova                 m2, [refq]
  mova                 m3, [refq+ref_strideq*2]
  pmullw               m1, filter_y_a
  pmullw               m5, filter_y_b
  paddw                m1, filter_rnd
  pmullw               m0, filter_y_a
  pmullw               m4, filter_y_b
  paddw                m0, filter_rnd
  paddw                m1, m5
  paddw                m0, m4
  psrlw                m1, 4
  psrlw                m0, 4
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  add                  second_predq, second_str
  pavgw                m1, [second_predq]
%endif
  SUM_SSE              m0, m2, m1, m3, m6, m7

  lea                  srcq, [srcq + src_strideq*4]
  lea                  refq, [refq + ref_strideq*4]
%if %2 == 1 ; avg
  add                  second_predq, second_str
%endif
%endif
  dec                  block_height
  jg .x_zero_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET

.x_nonzero:
  cmp                  x_offsetd, 8
  jne .x_nonhalf
  ; x_offset == 0.5
  test                 y_offsetd, y_offsetd
  jnz .x_half_y_nonzero

  ; x_offset == 0.5 && y_offset == 0
.x_half_y_zero_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m1, [srcq + 16]
  movu                 m4, [srcq + 2]
  movu                 m5, [srcq + 18]
  mova                 m2, [refq]
  mova                 m3, [refq + 16]
  pavgw                m0, m4
  pavgw                m1, m5
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  pavgw                m1, [second_predq+16]
%endif
  SUM_SSE              m0, m2, m1, m3, m6, m7

  lea                  srcq, [srcq + src_strideq*2]
  lea                  refq, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  add                  second_predq, second_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m1, [srcq + src_strideq*2]
  movu                 m4, [srcq + 2]
  movu                 m5, [srcq + src_strideq*2 + 2]
  mova                 m2, [refq]
  mova                 m3, [refq + ref_strideq*2]
  pavgw                m0, m4
  pavgw                m1, m5
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  add                  second_predq, second_str
  pavgw                m1, [second_predq]
%endif
  SUM_SSE              m0, m2, m1, m3, m6, m7

  lea                  srcq, [srcq + src_strideq*4]
  lea                  refq, [refq + ref_strideq*4]
%if %2 == 1 ; avg
  add                  second_predq, second_str
%endif
%endif
  dec                  block_height
  jg .x_half_y_zero_loop
  STORE_AND_RET

.x_half_y_nonzero:
  cmp                  y_offsetd, 8
  jne .x_half_y_nonhalf

  ; x_offset == 0.5 && y_offset == 0.5
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+16]
  movu                 m2, [srcq+2]
  movu                 m3, [srcq+18]
  lea                  srcq, [srcq + src_strideq*2]
  pavgw                m0, m2
  pavgw                m1, m3
.x_half_y_half_loop:
  movu                 m2, [srcq]
  movu                 m3, [srcq + 16]
  movu                 m4, [srcq + 2]
  movu                 m5, [srcq + 18]
  pavgw                m2, m4
  pavgw                m3, m5
  pavgw                m0, m2
  pavgw                m1, m3
  mova                 m4, [refq]
  mova                 m5, [refq + 16]
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  pavgw                m1, [second_predq+16]
%endif
  SUM_SSE              m0, m4, m1, m5, m6, m7
  mova                 m0, m2
  mova                 m1, m3

  lea                  srcq, [srcq + src_strideq*2]
  lea                  refq, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  add                  second_predq, second_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m2, [srcq+2]
  lea                  srcq, [srcq + src_strideq*2]
  pavgw                m0, m2
.x_half_y_half_loop:
  movu                 m2, [srcq]
  movu                 m3, [srcq + src_strideq*2]
  movu                 m4, [srcq + 2]
  movu                 m5, [srcq + src_strideq*2 + 2]
  pavgw                m2, m4
  pavgw                m3, m5
  pavgw                m0, m2
  pavgw                m2, m3
  mova                 m4, [refq]
  mova                 m5, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  add                  second_predq, second_str
  pavgw                m2, [second_predq]
%endif
  SUM_SSE              m0, m4, m2, m5, m6, m7
  mova                 m0, m3

  lea                  srcq, [srcq + src_strideq*4]
  lea                  refq, [refq + ref_strideq*4]
%if %2 == 1 ; avg
  add                  second_predq, second_str
%endif
%endif
  dec                  block_height
  jg .x_half_y_half_loop
  STORE_AND_RET

.x_half_y_nonhalf:
  ; x_offset == 0.5 && y_offset == bilin interpolation
%if VPX_ARCH_X86_64
  lea                  bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl                  y_offsetd, filter_idx_shift
%if VPX_ARCH_X86_64 && mmsize == 16
  mova                 m8, [bilin_filter+y_offsetq]
  mova                 m9, [bilin_filter+y_offsetq+16]
  mova                 m10, [GLOBAL(pw_8)]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else ; x86-32
%if VPX_ARCH_X86=1 && CONFIG_PIC=1
; x_offset == 0.5. We can reuse x_offset reg
%define tempq x_offsetq
  add                  y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
  mov                  tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add                  y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

%if %1 == 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+16]
  movu                 m2, [srcq+2]
  movu                 m3, [srcq+18]
  lea                  srcq, [srcq + src_strideq*2]
  pavgw                m0, m2
  pavgw                m1, m3
.x_half_y_other_loop:
  movu                 m2, [srcq]
  movu                 m3, [srcq+16]
  movu                 m4, [srcq+2]
  movu                 m5, [srcq+18]
  pavgw                m2, m4
  pavgw                m3, m5
  mova                 m4, m2
  mova                 m5, m3
  pmullw               m1, filter_y_a
  pmullw               m3, filter_y_b
  paddw                m1, filter_rnd
  paddw                m1, m3
  pmullw               m0, filter_y_a
  pmullw               m2, filter_y_b
  paddw                m0, filter_rnd
  psrlw                m1, 4
  paddw                m0, m2
  mova                 m2, [refq]
  psrlw                m0, 4
  mova                 m3, [refq+16]
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  pavgw                m1, [second_predq+16]
%endif
  SUM_SSE              m0, m2, m1, m3, m6, m7
  mova                 m0, m4
  mova                 m1, m5

  lea                  srcq, [srcq + src_strideq*2]
  lea                  refq, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  add                  second_predq, second_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m2, [srcq+2]
  lea                  srcq, [srcq + src_strideq*2]
  pavgw                m0, m2
.x_half_y_other_loop:
  movu                 m2, [srcq]
  movu                 m3, [srcq+src_strideq*2]
  movu                 m4, [srcq+2]
  movu                 m5, [srcq+src_strideq*2+2]
  pavgw                m2, m4
  pavgw                m3, m5
  mova                 m4, m2
  mova                 m5, m3
  pmullw               m4, filter_y_a
  pmullw               m3, filter_y_b
  paddw                m4, filter_rnd
  paddw                m4, m3
  pmullw               m0, filter_y_a
  pmullw               m2, filter_y_b
  paddw                m0, filter_rnd
  psrlw                m4, 4
  paddw                m0, m2
  mova                 m2, [refq]
  psrlw                m0, 4
  mova                 m3, [refq+ref_strideq*2]
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  add                  second_predq, second_str
  pavgw                m4, [second_predq]
%endif
  SUM_SSE              m0, m2, m4, m3, m6, m7
  mova                 m0, m5

  lea                  srcq, [srcq + src_strideq*4]
  lea                  refq, [refq + ref_strideq*4]
%if %2 == 1 ; avg
  add                  second_predq, second_str
%endif
%endif
  dec                  block_height
  jg .x_half_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET

.x_nonhalf:
  test                 y_offsetd, y_offsetd
  jnz .x_nonhalf_y_nonzero

  ; x_offset == bilin interpolation && y_offset == 0
%if VPX_ARCH_X86_64
  lea                  bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl                  x_offsetd, filter_idx_shift
%if VPX_ARCH_X86_64 && mmsize == 16
  mova                 m8, [bilin_filter+x_offsetq]
  mova                 m9, [bilin_filter+x_offsetq+16]
  mova                 m10, [GLOBAL(pw_8)]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else ; x86-32
%if VPX_ARCH_X86=1 && CONFIG_PIC=1
; y_offset == 0. We can reuse y_offset reg.
%define tempq y_offsetq
  add                  x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
  mov                  tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add                  x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

.x_other_y_zero_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+16]
  movu                 m2, [srcq+2]
  movu                 m3, [srcq+18]
  mova                 m4, [refq]
  mova                 m5, [refq+16]
  pmullw               m1, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m1, filter_rnd
  pmullw               m0, filter_x_a
  pmullw               m2, filter_x_b
  paddw                m0, filter_rnd
  paddw                m1, m3
  paddw                m0, m2
  psrlw                m1, 4
  psrlw                m0, 4
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  pavgw                m1, [second_predq+16]
%endif
  SUM_SSE              m0, m4, m1, m5, m6, m7

  lea                  srcq, [srcq+src_strideq*2]
  lea                  refq, [refq+ref_strideq*2]
%if %2 == 1 ; avg
  add                  second_predq, second_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+src_strideq*2]
  movu                 m2, [srcq+2]
  movu                 m3, [srcq+src_strideq*2+2]
  mova                 m4, [refq]
  mova                 m5, [refq+ref_strideq*2]
  pmullw               m1, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m1, filter_rnd
  pmullw               m0, filter_x_a
  pmullw               m2, filter_x_b
  paddw                m0, filter_rnd
  paddw                m1, m3
  paddw                m0, m2
  psrlw                m1, 4
  psrlw                m0, 4
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  add                  second_predq, second_str
  pavgw                m1, [second_predq]
%endif
  SUM_SSE              m0, m4, m1, m5, m6, m7

  lea                  srcq, [srcq+src_strideq*4]
  lea                  refq, [refq+ref_strideq*4]
%if %2 == 1 ; avg
  add                  second_predq, second_str
%endif
%endif
  dec                  block_height
  jg .x_other_y_zero_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
  STORE_AND_RET

.x_nonhalf_y_nonzero:
  cmp                  y_offsetd, 8
  jne .x_nonhalf_y_nonhalf

  ; x_offset == bilin interpolation && y_offset == 0.5
%if VPX_ARCH_X86_64
  lea                  bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl                  x_offsetd, filter_idx_shift
%if VPX_ARCH_X86_64 && mmsize == 16
  mova                 m8, [bilin_filter+x_offsetq]
  mova                 m9, [bilin_filter+x_offsetq+16]
  mova                 m10, [GLOBAL(pw_8)]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else ; x86-32
%if VPX_ARCH_X86=1 && CONFIG_PIC=1
; y_offset == 0.5. We can reuse y_offset reg.
%define tempq y_offsetq
  add                  x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
  mov                  tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add                  x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

%if %1 == 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+16]
  movu                 m2, [srcq+2]
  movu                 m3, [srcq+18]
  pmullw               m0, filter_x_a
  pmullw               m2, filter_x_b
  paddw                m0, filter_rnd
  pmullw               m1, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m1, filter_rnd
  paddw                m0, m2
  paddw                m1, m3
  psrlw                m0, 4
  psrlw                m1, 4
  lea                  srcq, [srcq+src_strideq*2]
.x_other_y_half_loop:
  movu                 m2, [srcq]
  movu                 m3, [srcq+16]
  movu                 m4, [srcq+2]
  movu                 m5, [srcq+18]
  pmullw               m2, filter_x_a
  pmullw               m4, filter_x_b
  paddw                m2, filter_rnd
  pmullw               m3, filter_x_a
  pmullw               m5, filter_x_b
  paddw                m3, filter_rnd
  paddw                m2, m4
  paddw                m3, m5
  mova                 m4, [refq]
  mova                 m5, [refq+16]
  psrlw                m2, 4
  psrlw                m3, 4
  pavgw                m0, m2
  pavgw                m1, m3
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  pavgw                m1, [second_predq+16]
%endif
  SUM_SSE              m0, m4, m1, m5, m6, m7
  mova                 m0, m2
  mova                 m1, m3

  lea                  srcq, [srcq+src_strideq*2]
  lea                  refq, [refq+ref_strideq*2]
%if %2 == 1 ; avg
  add                  second_predq, second_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m2, [srcq+2]
  pmullw               m0, filter_x_a
  pmullw               m2, filter_x_b
  paddw                m0, filter_rnd
  paddw                m0, m2
  psrlw                m0, 4
  lea                  srcq, [srcq+src_strideq*2]
.x_other_y_half_loop:
  movu                 m2, [srcq]
  movu                 m3, [srcq+src_strideq*2]
  movu                 m4, [srcq+2]
  movu                 m5, [srcq+src_strideq*2+2]
  pmullw               m2, filter_x_a
  pmullw               m4, filter_x_b
  paddw                m2, filter_rnd
  pmullw               m3, filter_x_a
  pmullw               m5, filter_x_b
  paddw                m3, filter_rnd
  paddw                m2, m4
  paddw                m3, m5
  mova                 m4, [refq]
  mova                 m5, [refq+ref_strideq*2]
  psrlw                m2, 4
  psrlw                m3, 4
  pavgw                m0, m2
  pavgw                m2, m3
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  add                  second_predq, second_str
  pavgw                m2, [second_predq]
%endif
  SUM_SSE              m0, m4, m2, m5, m6, m7
  mova                 m0, m3

  lea                  srcq, [srcq+src_strideq*4]
  lea                  refq, [refq+ref_strideq*4]
%if %2 == 1 ; avg
  add                  second_predq, second_str
%endif
%endif
  dec                  block_height
  jg .x_other_y_half_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
  STORE_AND_RET

.x_nonhalf_y_nonhalf:
; Load the filters - this is the same as in the 8-bit depth case.
%if VPX_ARCH_X86_64
  lea                  bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl                  x_offsetd, filter_idx_shift ; filter_idx_shift = 5
  shl                  y_offsetd, filter_idx_shift
%if VPX_ARCH_X86_64 && mmsize == 16
  mova                 m8, [bilin_filter+x_offsetq]
  mova                 m9, [bilin_filter+x_offsetq+16]
  mova                 m10, [bilin_filter+y_offsetq]
  mova                 m11, [bilin_filter+y_offsetq+16]
  mova                 m12, [GLOBAL(pw_8)]
%define filter_x_a m8
%define filter_x_b m9
%define filter_y_a m10
%define filter_y_b m11
%define filter_rnd m12
%else ; x86-32
%if VPX_ARCH_X86=1 && CONFIG_PIC=1
; In this case there is no unused register, so we use the src_stride register;
; later, src_stride has to be reloaded from the stack when it is needed.
%define tempq src_strideq
  mov                  tempq, g_bilin_filterm
  add                  x_offsetq, tempq
  add                  y_offsetq, tempq
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]

  mov                  tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add                  x_offsetq, bilin_filter
  add                  y_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif
; end of load filter

  ; x_offset == bilin interpolation && y_offset == bilin interpolation
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m2, [srcq+2]
  movu                 m1, [srcq+16]
  movu                 m3, [srcq+18]
  pmullw               m0, filter_x_a
  pmullw               m2, filter_x_b
  paddw                m0, filter_rnd
  pmullw               m1, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m1, filter_rnd
  paddw                m0, m2
  paddw                m1, m3
  psrlw                m0, 4
  psrlw                m1, 4

  INC_SRC_BY_SRC_STRIDE

.x_other_y_other_loop:
  movu                 m2, [srcq]
  movu                 m4, [srcq+2]
  movu                 m3, [srcq+16]
  movu                 m5, [srcq+18]
  pmullw               m2, filter_x_a
  pmullw               m4, filter_x_b
  paddw                m2, filter_rnd
  pmullw               m3, filter_x_a
  pmullw               m5, filter_x_b
  paddw                m3, filter_rnd
  paddw                m2, m4
  paddw                m3, m5
  psrlw                m2, 4
  psrlw                m3, 4
  mova                 m4, m2
  mova                 m5, m3
  pmullw               m0, filter_y_a
  pmullw               m2, filter_y_b
  paddw                m0, filter_rnd
  pmullw               m1, filter_y_a
  pmullw               m3, filter_y_b
  paddw                m0, m2
  paddw                m1, filter_rnd
  mova                 m2, [refq]
  paddw                m1, m3
  psrlw                m0, 4
  psrlw                m1, 4
  mova                 m3, [refq+16]
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  pavgw                m1, [second_predq+16]
%endif
  SUM_SSE              m0, m2, m1, m3, m6, m7
  mova                 m0, m4
  mova                 m1, m5

  INC_SRC_BY_SRC_STRIDE
  lea                  refq, [refq + ref_strideq * 2]
%if %2 == 1 ; avg
  add                  second_predq, second_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m2, [srcq+2]
  pmullw               m0, filter_x_a
  pmullw               m2, filter_x_b
  paddw                m0, filter_rnd
  paddw                m0, m2
  psrlw                m0, 4

  INC_SRC_BY_SRC_STRIDE

.x_other_y_other_loop:
  movu                 m2, [srcq]
  movu                 m4, [srcq+2]
  INC_SRC_BY_SRC_STRIDE
  movu                 m3, [srcq]
  movu                 m5, [srcq+2]
  pmullw               m2, filter_x_a
  pmullw               m4, filter_x_b
  paddw                m2, filter_rnd
  pmullw               m3, filter_x_a
  pmullw               m5, filter_x_b
  paddw                m3, filter_rnd
  paddw                m2, m4
  paddw                m3, m5
  psrlw                m2, 4
  psrlw                m3, 4
  mova                 m4, m2
  mova                 m5, m3
  pmullw               m0, filter_y_a
  pmullw               m2, filter_y_b
  paddw                m0, filter_rnd
  pmullw               m4, filter_y_a
  pmullw               m3, filter_y_b
  paddw                m0, m2
  paddw                m4, filter_rnd
  mova                 m2, [refq]
  paddw                m4, m3
  psrlw                m0, 4
  psrlw                m4, 4
  mova                 m3, [refq+ref_strideq*2]
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  add                  second_predq, second_str
  pavgw                m4, [second_predq]
%endif
  SUM_SSE              m0, m2, m4, m3, m6, m7
  mova                 m0, m5

  INC_SRC_BY_SRC_STRIDE
  lea                  refq, [refq + ref_strideq * 4]
%if %2 == 1 ; avg
  add                  second_predq, second_str
%endif
%endif
  dec                  block_height
  jg .x_other_y_other_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET
%endmacro

INIT_XMM sse2
SUBPEL_VARIANCE 8
SUBPEL_VARIANCE 16

INIT_XMM sse2
SUBPEL_VARIANCE 8, 1
SUBPEL_VARIANCE 16, 1