;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"
extern sym(vp8_bilinear_filters_x86_8)

%define BLOCK_HEIGHT_WIDTH 4
%define VP8_FILTER_WEIGHT 128
%define VP8_FILTER_SHIFT  7

SECTION .text

;/************************************************************************************
; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels.
; Each loop iteration reads src_ptr[-2 .. 13] of one row and produces ONE row of 8
; results, using 128 bit operations. Every result is rounded, shifted right by
; VP8_FILTER_SHIFT, clamped to 0..255 and stored as a 16 bit word.
;
; The row loop tests its counter at the bottom, so output_height must be >= 1.
; (The original notes say output_height is assumed even; nothing in this loop
; depends on that.)
;
;   output_ptr  written with movdqa, so every row start must be 16-byte aligned;
;               it is advanced by output_width BYTES per row.
;   vp8_filter  six 16-byte vectors of 16 bit coefficients, one vector per tap.
;   pixel_step  not read by this routine.
;*************************************************************************************/
;void vp8_filter_block1d8_h6_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned short *output_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned int    pixel_step,
;    unsigned int    output_height,
;    unsigned int    output_width,
;    short           *vp8_filter
;)
global sym(vp8_filter_block1d8_h6_sse2) PRIVATE
sym(vp8_filter_block1d8_h6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; Register roles for the whole routine:
    ;   rsi = current source row      rdi = current output row
    ;   rax = source pitch            rcx = rows remaining
    ;   rdx = filter table            r8  = output pitch (64 bit builds only)
    ;   xmm0 = zero, used to widen bytes to words
    mov         rsi, arg(0)                     ;src_ptr
    mov         rdi, arg(1)                     ;output_ptr
    mov         rdx, arg(6)                     ;vp8_filter

    movsxd      rax, dword ptr arg(2)           ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)           ;output_height
%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(5)            ;output_width
%endif
    pxor        xmm0, xmm0

.filter_block1d8_h6_rowloop:
    ; xmm3 = src[-2 .. 5], xmm1 = src[-2 .. 13]
    movq        xmm3, MMWORD PTR [rsi - 2]
    movq        xmm1, MMWORD PTR [rsi + 6]
    prefetcht2  [rsi + rax - 2]                 ; hint for the next source row
    pslldq      xmm1, 8
    por         xmm1, xmm3

    ; One working copy per tap. Tap 1 uses xmm3 as is, tap 6 consumes xmm1.
    movdqa      xmm4, xmm1
    movdqa      xmm5, xmm1
    movdqa      xmm6, xmm1
    movdqa      xmm7, xmm1

    ; Tap 1: src[-2 .. 5]
    punpcklbw   xmm3, xmm0
    pmullw      xmm3, XMMWORD PTR [rdx]

    ; Tap 2: src[-1 .. 6]
    psrldq      xmm4, 1
    punpcklbw   xmm4, xmm0
    pmullw      xmm4, XMMWORD PTR [rdx + 16]

    ; Tap 3: src[0 .. 7]
    psrldq      xmm5, 2
    punpcklbw   xmm5, xmm0
    pmullw      xmm5, [rdx + 32]

    ; Tap 4: src[1 .. 8]
    psrldq      xmm6, 3
    punpcklbw   xmm6, xmm0
    pmullw      xmm6, [rdx + 48]

    ; Tap 5: src[2 .. 9]
    psrldq      xmm7, 4
    punpcklbw   xmm7, xmm0
    pmullw      xmm7, [rdx + 64]

    ; Tap 6: src[3 .. 10]
    psrldq      xmm1, 5
    punpcklbw   xmm1, xmm0
    pmullw      xmm1, [rdx + 80]

    ; Saturating adds are order sensitive: keep the sequence 2+5, +3, +1, +4, +6.
    paddsw      xmm4, xmm7
    paddsw      xmm4, xmm5
    paddsw      xmm4, xmm3
    paddsw      xmm4, xmm6
    paddsw      xmm4, xmm1
    paddsw      xmm4, [GLOBAL(rd)]              ; rounding term
    psraw       xmm4, VP8_FILTER_SHIFT

    ; Clamp to 0..255, then widen back to words for the 16 bit output buffer.
    packuswb    xmm4, xmm0
    punpcklbw   xmm4, xmm0
    movdqa      XMMWORD PTR [rdi], xmm4

    lea         rsi, [rsi + rax]                ; next source row
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(5)           ;[output_width]
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .filter_block1d8_h6_rowloop     ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_filter_block1d16_h6_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned short *output_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned int    pixel_step,
;    unsigned int    output_height,
;    unsigned int    output_width,
;    short           *vp8_filter
;)
;/************************************************************************************
; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
; input pixel array has output_height rows. This routine assumes that output_height is an
; even number. This function handles 16 pixels in horizontal direction, calculating ONE
; row each iteration to take advantage of the 128 bits operations.
;*************************************************************************************/
; Each row is produced as two halves of 8 results. Both halves run the same
; six-tap sequence; the second half works on a window starting at src[6].
; Results are stored as 16 bit words with movdqa, so every output row start
; must be 16-byte aligned. pixel_step is not read by this routine.
global sym(vp8_filter_block1d16_h6_sse2) PRIVATE
sym(vp8_filter_block1d16_h6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; rsi = source row, rdi = output row, rax = source pitch, rcx = rows left,
    ; rdx = filter table, r8 = output pitch (64 bit builds only), xmm0 = zero.
    mov         rsi, arg(0)                     ;src_ptr
    mov         rdi, arg(1)                     ;output_ptr
    mov         rdx, arg(6)                     ;vp8_filter

    movsxd      rax, dword ptr arg(2)           ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)           ;output_height
%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(5)            ;output_width
%endif
    pxor        xmm0, xmm0

.filter_block1d16_h6_sse2_rowloop:
    movq        xmm3, MMWORD PTR [rsi - 2]      ; src[-2 .. 5]
    movq        xmm1, MMWORD PTR [rsi + 6]      ; src[ 6 .. 13]

    ; The upper window is loaded from +11 (src[11 .. 18]) instead of +14 so no
    ; byte beyond src[18] is touched. After shifting by 5 it overlaps xmm1 in
    ; three byte positions, but those hold the same source bytes (11, 12, 13),
    ; so OR-ing without masking gives src[6 .. 18].
    movq        xmm2, MMWORD PTR [rsi + 11]
    pslldq      xmm2, 5
    por         xmm2, xmm1                      ; must precede the shift of xmm1

    prefetcht2  [rsi + rax - 2]                 ; hint for the next source row

    pslldq      xmm1, 8
    por         xmm1, xmm3                      ; xmm1 = src[-2 .. 13]

    ; ---- first 8 results -------------------------------------------------
    movdqa      xmm4, xmm1
    movdqa      xmm5, xmm1
    movdqa      xmm6, xmm1
    movdqa      xmm7, xmm1

    ; Tap 1: src[-2 .. 5]
    punpcklbw   xmm3, xmm0
    pmullw      xmm3, XMMWORD PTR [rdx]

    ; Tap 2: src[-1 .. 6]
    psrldq      xmm4, 1
    punpcklbw   xmm4, xmm0
    pmullw      xmm4, XMMWORD PTR [rdx + 16]

    ; Tap 3: src[0 .. 7]
    psrldq      xmm5, 2
    punpcklbw   xmm5, xmm0
    pmullw      xmm5, [rdx + 32]

    ; Tap 4: src[1 .. 8]
    psrldq      xmm6, 3
    punpcklbw   xmm6, xmm0
    pmullw      xmm6, [rdx + 48]

    ; Tap 5: src[2 .. 9]
    psrldq      xmm7, 4
    punpcklbw   xmm7, xmm0
    pmullw      xmm7, [rdx + 64]

    ; Tap 6: src[3 .. 10]
    psrldq      xmm1, 5
    punpcklbw   xmm1, xmm0
    pmullw      xmm1, [rdx + 80]

    ; Saturating adds are order sensitive: keep 2+5, +3, +1, +4, +6, +round.
    paddsw      xmm4, xmm7
    paddsw      xmm4, xmm5
    paddsw      xmm4, xmm3
    paddsw      xmm4, xmm6
    paddsw      xmm4, xmm1
    paddsw      xmm4, [GLOBAL(rd)]
    psraw       xmm4, VP8_FILTER_SHIFT

    packuswb    xmm4, xmm0                      ; clamp to 0..255
    punpcklbw   xmm4, xmm0                      ; back to 16 bit words
    movdqa      XMMWORD PTR [rdi], xmm4

    ; ---- second 8 results (window xmm2 = src[6 .. 18]) ---------------------
    movdqa      xmm3, xmm2
    movdqa      xmm4, xmm2
    movdqa      xmm5, xmm2
    movdqa      xmm6, xmm2
    movdqa      xmm7, xmm2

    ; Tap 1: src[6 .. 13]
    punpcklbw   xmm3, xmm0
    pmullw      xmm3, XMMWORD PTR [rdx]

    ; Tap 2: src[7 .. 14]
    psrldq      xmm4, 1
    punpcklbw   xmm4, xmm0
    pmullw      xmm4, XMMWORD PTR [rdx + 16]

    ; Tap 3: src[8 .. 15]
    psrldq      xmm5, 2
    punpcklbw   xmm5, xmm0
    pmullw      xmm5, [rdx + 32]

    ; Tap 4: src[9 .. 16]
    psrldq      xmm6, 3
    punpcklbw   xmm6, xmm0
    pmullw      xmm6, [rdx + 48]

    ; Tap 5: src[10 .. 17]
    psrldq      xmm7, 4
    punpcklbw   xmm7, xmm0
    pmullw      xmm7, [rdx + 64]

    ; Tap 6: src[11 .. 18]
    psrldq      xmm2, 5
    punpcklbw   xmm2, xmm0
    pmullw      xmm2, [rdx + 80]

    paddsw      xmm4, xmm7
    paddsw      xmm4, xmm5
    paddsw      xmm4, xmm3
    paddsw      xmm4, xmm6
    paddsw      xmm4, xmm2
    paddsw      xmm4, [GLOBAL(rd)]
    psraw       xmm4, VP8_FILTER_SHIFT

    packuswb    xmm4, xmm0
    punpcklbw   xmm4, xmm0
    movdqa      XMMWORD PTR [rdi + 16], xmm4

    lea         rsi, [rsi + rax]                ; next source row
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(5)           ;[output_width]
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .filter_block1d16_h6_sse2_rowloop ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_filter_block1d8_v6_sse2
;(
;    short         *src_ptr,
;    unsigned char *output_ptr,
;    int            dst_pitch,
;    unsigned int   pixels_per_line,
;    unsigned int   pixel_step,
;    unsigned int   output_height,
;    unsigned int   output_width,
;    short         *vp8_filter
;)
;/************************************************************************************
; Notes: filter_block1d8_v6 applies a 6 tap filter vertically to the input pixels. The
; input pixel array has output_height rows.
;*************************************************************************************/
; The six source rows for an output row start two strides above src_ptr.
; pixels_per_line is added to the pointer directly, so it acts as a byte stride.
; Source rows are read with movdqa and must therefore be 16-byte aligned.
; Each iteration stores 8 output bytes. pixel_step and output_width are not read.
global sym(vp8_filter_block1d8_v6_sse2) PRIVATE
sym(vp8_filter_block1d8_v6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; rsi = top row of the 6-row window, rdi = output row, rdx = source stride,
    ; rax = filter table, rcx = rows left, r8 = dst pitch (64 bit builds only),
    ; xmm0 = zero, xmm7 = rounding term.
    mov         rsi, arg(0)                     ;src_ptr
    mov         rdi, arg(1)                     ;output_ptr
    mov         rax, arg(7)                     ;vp8_filter
    movsxd      rdx, dword ptr arg(3)           ;pixels_per_line

    sub         rsi, rdx                        ; back up two rows so the window
    sub         rsi, rdx                        ; starts at src_ptr - 2 * stride

    movsxd      rcx, DWORD PTR arg(5)           ;[output_height]
%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(2)            ; dst_pitch
%endif
    pxor        xmm0, xmm0
    movdqa      xmm7, XMMWORD PTR [GLOBAL(rd)]

.vp8_filter_block1d8_v6_sse2_loop:
    ; Rows 1, 2, 3 and 5 are reachable with the addressing scales 0, 1, 2, 4.
    movdqa      xmm1, XMMWORD PTR [rsi]             ; row 1
    movdqa      xmm2, XMMWORD PTR [rsi + rdx]       ; row 2
    movdqa      xmm3, XMMWORD PTR [rsi + rdx * 2]   ; row 3
    movdqa      xmm5, XMMWORD PTR [rsi + rdx * 4]   ; row 5

    ; Stepping rsi by one row makes rows 4 and 6 addressable, and doubles as
    ; the advance to the next output row.
    add         rsi, rdx
    movdqa      xmm4, XMMWORD PTR [rsi + rdx * 2]   ; row 4
    movdqa      xmm6, XMMWORD PTR [rsi + rdx * 4]   ; row 6

    pmullw      xmm1, [rax]
    pmullw      xmm2, [rax + 16]
    pmullw      xmm3, [rax + 32]
    pmullw      xmm4, [rax + 48]
    pmullw      xmm5, [rax + 64]
    pmullw      xmm6, [rax + 80]

    ; Saturating adds are order sensitive: keep 2+5, +3, +1, +4, +6, +round.
    paddsw      xmm2, xmm5
    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm1
    paddsw      xmm2, xmm4
    paddsw      xmm2, xmm6
    paddsw      xmm2, xmm7

    psraw       xmm2, VP8_FILTER_SHIFT
    packuswb    xmm2, xmm0                      ; pack and saturate

    movq        QWORD PTR [rdi], xmm2           ; store 8 result bytes
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(2)           ;[dst_pitch]
%else
    add         rdi, r8
%endif
    dec         rcx                             ; decrement count
    jnz         .vp8_filter_block1d8_v6_sse2_loop ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_filter_block1d16_v6_sse2
;(
;    unsigned short *src_ptr,
;    unsigned char  *output_ptr,
;    int             dst_pitch,
;    unsigned int    pixels_per_line,
;    unsigned int    pixel_step,
;    unsigned int    output_height,
;    unsigned int    output_width,
;    const short    *vp8_filter
;)
;/************************************************************************************
; Notes: filter_block1d16_v6 applies a 6 tap filter vertically to the input pixels. The
; input pixel array has output_height rows.
;
; Every source row is read as two 16-byte vectors (16 words) with movdqa, and
; every output row is one 16-byte movdqa store, so both need 16-byte alignment.
; The 6-row window starts two strides above src_ptr; pixels_per_line is used
; as a byte stride. pixel_step and output_width are not read.
;*************************************************************************************/
global sym(vp8_filter_block1d16_v6_sse2) PRIVATE
sym(vp8_filter_block1d16_v6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; rsi = top row of the 6-row window, rdi = output row, rdx = source stride,
    ; rax = filter table, rcx = rows left, r8 = dst pitch (64 bit builds only).
    mov         rsi, arg(0)                     ;src_ptr
    mov         rdi, arg(1)                     ;output_ptr
    mov         rax, arg(7)                     ;vp8_filter
    movsxd      rdx, dword ptr arg(3)           ;pixels_per_line

    sub         rsi, rdx                        ; window starts at
    sub         rsi, rdx                        ; src_ptr - 2 * stride

    movsxd      rcx, DWORD PTR arg(5)           ;[output_height]
%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(2)            ; dst_pitch
%endif

.vp8_filter_block1d16_v6_sse2_loop:
    ; The taps are summed in the order 2 5 3 1 4 6, so the rows are read in
    ; that order. xmm1 accumulates the low 8 columns, xmm2 the high 8.

    ; line 2 - seeds both accumulators
    movdqa      xmm1, XMMWORD PTR [rsi + rdx]
    movdqa      xmm2, XMMWORD PTR [rsi + rdx + 16]
    pmullw      xmm1, [rax + 16]
    pmullw      xmm2, [rax + 16]

    ; line 5
    movdqa      xmm3, XMMWORD PTR [rsi + rdx * 4]
    movdqa      xmm4, XMMWORD PTR [rsi + rdx * 4 + 16]
    pmullw      xmm3, [rax + 64]
    pmullw      xmm4, [rax + 64]
    paddsw      xmm1, xmm3
    paddsw      xmm2, xmm4

    ; line 3
    movdqa      xmm5, XMMWORD PTR [rsi + rdx * 2]
    movdqa      xmm6, XMMWORD PTR [rsi + rdx * 2 + 16]
    pmullw      xmm5, [rax + 32]
    pmullw      xmm6, [rax + 32]
    paddsw      xmm1, xmm5
    paddsw      xmm2, xmm6

    ; line 1
    movdqa      xmm7, XMMWORD PTR [rsi]
    movdqa      xmm0, XMMWORD PTR [rsi + 16]
    pmullw      xmm7, [rax]
    pmullw      xmm0, [rax]
    paddsw      xmm1, xmm7
    paddsw      xmm2, xmm0

    ; Step one row down: lines 4 and 6 become addressable with scales 2 and 4,
    ; and rsi is already positioned for the next output row.
    add         rsi, rdx

    ; line 4
    movdqa      xmm3, XMMWORD PTR [rsi + rdx * 2]
    movdqa      xmm4, XMMWORD PTR [rsi + rdx * 2 + 16]
    pmullw      xmm3, [rax + 48]
    pmullw      xmm4, [rax + 48]
    paddsw      xmm1, xmm3
    paddsw      xmm2, xmm4

    ; line 6
    movdqa      xmm5, XMMWORD PTR [rsi + rdx * 4]
    movdqa      xmm6, XMMWORD PTR [rsi + rdx * 4 + 16]
    pmullw      xmm5, [rax + 80]
    pmullw      xmm6, [rax + 80]
    paddsw      xmm1, xmm5
    paddsw      xmm2, xmm6

    movdqa      xmm7, XMMWORD PTR [GLOBAL(rd)]  ; rounding term
    pxor        xmm0, xmm0                      ; cleared as in the original; not
                                                ; read again before it is reloaded

    paddsw      xmm1, xmm7
    paddsw      xmm2, xmm7

    psraw       xmm1, VP8_FILTER_SHIFT
    psraw       xmm2, VP8_FILTER_SHIFT

    packuswb    xmm1, xmm2                      ; pack and saturate to 16 bytes
    movdqa      XMMWORD PTR [rdi], xmm1         ; store the results in the destination
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(2)           ;[dst_pitch]
%else
    add         rdi, r8
%endif
    dec         rcx                             ; decrement count
    jnz         .vp8_filter_block1d16_v6_sse2_loop ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_filter_block1d8_h6_only_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    int             dst_pitch,
;    unsigned int    output_height,
;    const short    *vp8_filter
;)
; First-pass filter only when yoffset==0
;
; Horizontal 6 tap filter that writes final 8 bit pixels: each iteration reads
; src_ptr[-2 .. 13] of one row and stores 8 result bytes with movq. The row
; loop tests its counter at the bottom, so output_height must be >= 1.
global sym(vp8_filter_block1d8_h6_only_sse2) PRIVATE
sym(vp8_filter_block1d8_h6_only_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; rsi = source row, rdi = output row, rax = source pitch, rcx = rows left,
    ; rdx = filter table, r8 = dst pitch (64 bit builds only), xmm0 = zero.
    mov         rsi, arg(0)                     ;src_ptr
    mov         rdi, arg(2)                     ;output_ptr
    mov         rdx, arg(5)                     ;vp8_filter

    movsxd      rax, dword ptr arg(1)           ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)           ;output_height
%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(3)            ;dst_pitch
%endif
    pxor        xmm0, xmm0

.filter_block1d8_h6_only_rowloop:
    ; xmm3 = src[-2 .. 5], xmm1 = src[-2 .. 13]
    movq        xmm3, MMWORD PTR [rsi - 2]
    movq        xmm1, MMWORD PTR [rsi + 6]
    prefetcht2  [rsi + rax - 2]                 ; hint for the next source row
    pslldq      xmm1, 8
    por         xmm1, xmm3

    movdqa      xmm4, xmm1
    movdqa      xmm5, xmm1
    movdqa      xmm6, xmm1
    movdqa      xmm7, xmm1

    ; Tap 1: src[-2 .. 5]
    punpcklbw   xmm3, xmm0
    pmullw      xmm3, XMMWORD PTR [rdx]

    ; Tap 2: src[-1 .. 6]
    psrldq      xmm4, 1
    punpcklbw   xmm4, xmm0
    pmullw      xmm4, XMMWORD PTR [rdx + 16]

    ; Tap 3: src[0 .. 7]
    psrldq      xmm5, 2
    punpcklbw   xmm5, xmm0
    pmullw      xmm5, [rdx + 32]

    ; Tap 4: src[1 .. 8]
    psrldq      xmm6, 3
    punpcklbw   xmm6, xmm0
    pmullw      xmm6, [rdx + 48]

    ; Tap 5: src[2 .. 9]
    psrldq      xmm7, 4
    punpcklbw   xmm7, xmm0
    pmullw      xmm7, [rdx + 64]

    ; Tap 6: src[3 .. 10]
    psrldq      xmm1, 5
    punpcklbw   xmm1, xmm0
    pmullw      xmm1, [rdx + 80]

    ; Saturating adds are order sensitive: keep 2+5, +3, +1, +4, +6, +round.
    paddsw      xmm4, xmm7
    paddsw      xmm4, xmm5
    paddsw      xmm4, xmm3
    paddsw      xmm4, xmm6
    paddsw      xmm4, xmm1
    paddsw      xmm4, [GLOBAL(rd)]
    psraw       xmm4, VP8_FILTER_SHIFT

    packuswb    xmm4, xmm0                      ; clamp to 0..255
    movq        QWORD PTR [rdi], xmm4           ; store the results in the destination

    lea         rsi, [rsi + rax]                ; next source row
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)           ;dst_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .filter_block1d8_h6_only_rowloop ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_filter_block1d16_h6_only_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    int             dst_pitch,
;    unsigned int    output_height,
;    const short    *vp8_filter
;)
; First-pass filter only when yoffset==0
;
; 16-wide variant: each row is produced as two halves of 8 result bytes, stored
; with movq at [output] and [output + 8]. The widest tap needs src_ptr[18];
; this routine reads src_ptr[-2 .. 18] and nothing beyond.
global sym(vp8_filter_block1d16_h6_only_sse2) PRIVATE
sym(vp8_filter_block1d16_h6_only_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; rsi = source row, rdi = output row, rax = source pitch, rcx = rows left,
    ; rdx = filter table, r8 = dst pitch (64 bit builds only), xmm0 = zero.
    mov         rsi, arg(0)                     ;src_ptr
    mov         rdi, arg(2)                     ;output_ptr
    mov         rdx, arg(5)                     ;vp8_filter

    movsxd      rax, dword ptr arg(1)           ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)           ;output_height
%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(3)            ;dst_pitch
%endif
    pxor        xmm0, xmm0

.filter_block1d16_h6_only_sse2_rowloop:
    movq        xmm3, MMWORD PTR [rsi - 2]      ; src[-2 .. 5]
    movq        xmm1, MMWORD PTR [rsi + 6]      ; src[ 6 .. 13]

    ; Load from 11 (src[11 .. 18]) rather than 14 to avoid reading past the
    ; last byte any tap uses. After the 5-byte shift the window overlaps xmm1
    ; in three positions, but those hold the same source bytes (11, 12, 13),
    ; so OR-ing without masking gives src[6 .. 18]. Byte positions 13..15 stay
    ; zero and are never consumed: the widest tap shifts by 5 and uses only
    ; the low 8 bytes.
    movq        xmm2, MMWORD PTR [rsi + 11]
    pslldq      xmm2, 5
    por         xmm2, xmm1                      ; must precede the shift of xmm1

    prefetcht2  [rsi + rax - 2]                 ; hint for the next source row

    pslldq      xmm1, 8
    por         xmm1, xmm3                      ; xmm1 = src[-2 .. 13]

    ; ---- lower 8 bytes ---------------------------------------------------
    movdqa      xmm4, xmm1
    movdqa      xmm5, xmm1
    movdqa      xmm6, xmm1
    movdqa      xmm7, xmm1

    ; Tap 1: src[-2 .. 5]
    punpcklbw   xmm3, xmm0
    pmullw      xmm3, XMMWORD PTR [rdx]

    ; Tap 2: src[-1 .. 6]
    psrldq      xmm4, 1
    punpcklbw   xmm4, xmm0
    pmullw      xmm4, XMMWORD PTR [rdx + 16]

    ; Tap 3: src[0 .. 7]
    psrldq      xmm5, 2
    punpcklbw   xmm5, xmm0
    pmullw      xmm5, [rdx + 32]

    ; Tap 4: src[1 .. 8]
    psrldq      xmm6, 3
    punpcklbw   xmm6, xmm0
    pmullw      xmm6, [rdx + 48]

    ; Tap 5: src[2 .. 9]
    psrldq      xmm7, 4
    punpcklbw   xmm7, xmm0
    pmullw      xmm7, [rdx + 64]

    ; Tap 6: src[3 .. 10]
    psrldq      xmm1, 5
    punpcklbw   xmm1, xmm0
    pmullw      xmm1, [rdx + 80]

    ; Saturating adds are order sensitive: keep 2+5, +3, +1, +4, +6, +round.
    paddsw      xmm4, xmm7
    paddsw      xmm4, xmm5
    paddsw      xmm4, xmm3
    paddsw      xmm4, xmm6
    paddsw      xmm4, xmm1
    paddsw      xmm4, [GLOBAL(rd)]
    psraw       xmm4, VP8_FILTER_SHIFT

    packuswb    xmm4, xmm0                      ; lower 8 bytes
    movq        QWORD PTR [rdi], xmm4           ; store the results in the destination

    ; ---- higher 8 bytes (window xmm2 = src[6 .. 18]) -----------------------
    movdqa      xmm3, xmm2
    movdqa      xmm4, xmm2
    movdqa      xmm5, xmm2
    movdqa      xmm6, xmm2
    movdqa      xmm7, xmm2

    ; Tap 1: src[6 .. 13]
    punpcklbw   xmm3, xmm0
    pmullw      xmm3, XMMWORD PTR [rdx]

    ; Tap 2: src[7 .. 14]
    psrldq      xmm4, 1
    punpcklbw   xmm4, xmm0
    pmullw      xmm4, XMMWORD PTR [rdx + 16]

    ; Tap 3: src[8 .. 15]
    psrldq      xmm5, 2
    punpcklbw   xmm5, xmm0
    pmullw      xmm5, [rdx + 32]

    ; Tap 4: src[9 .. 16]
    psrldq      xmm6, 3
    punpcklbw   xmm6, xmm0
    pmullw      xmm6, [rdx + 48]

    ; Tap 5: src[10 .. 17]
    psrldq      xmm7, 4
    punpcklbw   xmm7, xmm0
    pmullw      xmm7, [rdx + 64]

    ; Tap 6: src[11 .. 18]
    psrldq      xmm2, 5
    punpcklbw   xmm2, xmm0
    pmullw      xmm2, [rdx + 80]

    paddsw      xmm4, xmm7
    paddsw      xmm4, xmm5
    paddsw      xmm4, xmm3
    paddsw      xmm4, xmm6
    paddsw      xmm4, xmm2
    paddsw      xmm4, [GLOBAL(rd)]
    psraw       xmm4, VP8_FILTER_SHIFT

    packuswb    xmm4, xmm0                      ; higher 8 bytes
    movq        QWORD PTR [rdi + 8], xmm4       ; store the results in the destination

    lea         rsi, [rsi + rax]                ; next source row
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)           ;dst_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .filter_block1d16_h6_only_sse2_rowloop ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_filter_block1d8_v6_only_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    int             dst_pitch,
;    unsigned int    output_height,
;    const short    *vp8_filter
;)
; Second-pass filter only when xoffset==0
;
; Vertical 6 tap filter on 8 bit pixels, 8 columns wide. The 6-row window for
; the first output row starts AT src_ptr: this routine does not step the
; pointer back itself. Each iteration stores 8 result bytes with movq.
global sym(vp8_filter_block1d8_v6_only_sse2) PRIVATE
sym(vp8_filter_block1d8_v6_only_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; rsi = top row of the 6-row window, rdi = output row, rdx = source pitch,
    ; rax = filter table, rcx = rows left, r8 = dst pitch (64 bit builds only),
    ; xmm0 = zero, xmm7 = rounding term.
    mov         rsi, arg(0)                     ;src_ptr
    mov         rdi, arg(2)                     ;output_ptr
    mov         rax, arg(5)                     ;vp8_filter

    movsxd      rcx, dword ptr arg(4)           ;output_height
    movsxd      rdx, dword ptr arg(1)           ;src_pixels_per_line
%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(3)            ; dst_pitch
%endif
    pxor        xmm0, xmm0
    movdqa      xmm7, XMMWORD PTR [GLOBAL(rd)]

.vp8_filter_block1d8_v6_only_sse2_loop:
    ; Rows 1, 2, 3 and 5 are reachable with the addressing scales 0, 1, 2, 4.
    movq        xmm1, MMWORD PTR [rsi]              ; row 1
    movq        xmm2, MMWORD PTR [rsi + rdx]        ; row 2
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]    ; row 3
    movq        xmm5, MMWORD PTR [rsi + rdx * 4]    ; row 5

    ; Stepping rsi by one row makes rows 4 and 6 addressable, and doubles as
    ; the advance to the next output row.
    add         rsi, rdx
    movq        xmm4, MMWORD PTR [rsi + rdx * 2]    ; row 4
    movq        xmm6, MMWORD PTR [rsi + rdx * 4]    ; row 6

    punpcklbw   xmm1, xmm0
    pmullw      xmm1, [rax]

    punpcklbw   xmm2, xmm0
    pmullw      xmm2, [rax + 16]

    punpcklbw   xmm3, xmm0
    pmullw      xmm3, [rax + 32]

    punpcklbw   xmm4, xmm0
    pmullw      xmm4, [rax + 48]

    punpcklbw   xmm5, xmm0
    pmullw      xmm5, [rax + 64]

    punpcklbw   xmm6, xmm0
    pmullw      xmm6, [rax + 80]

    ; Saturating adds are order sensitive: keep 2+5, +3, +1, +4, +6, +round.
    paddsw      xmm2, xmm5
    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm1
    paddsw      xmm2, xmm4
    paddsw      xmm2, xmm6
    paddsw      xmm2, xmm7

    psraw       xmm2, VP8_FILTER_SHIFT
    packuswb    xmm2, xmm0                      ; pack and saturate

    movq        QWORD PTR [rdi], xmm2           ; store the results in the destination
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)           ;[dst_pitch]
%else
    add         rdi, r8
%endif
    dec         rcx                             ; decrement count
    jnz         .vp8_filter_block1d8_v6_only_sse2_loop ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_unpack_block1d16_h6_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned short *output_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned int    output_height,
;    unsigned int    output_width
;)
;
; Widens 16 source bytes per row to 16 zero-extended 16 bit words; no filtering
; is applied. The two 16-byte stores use movdqa, so every output row start must
; be 16-byte aligned; output_ptr advances by output_width BYTES per row.
; Only xmm0, xmm1 and xmm3 are used, hence no SAVE_XMM / RESTORE_XMM pair.
global sym(vp8_unpack_block1d16_h6_sse2) PRIVATE
sym(vp8_unpack_block1d16_h6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                     ;src_ptr
    mov         rdi, arg(1)                     ;output_ptr

    movsxd      rcx, dword ptr arg(3)           ;output_height
    movsxd      rax, dword ptr arg(2)           ;src_pixels_per_line ; pitch for source
%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(4)            ;output_width ; pitch for destination
%endif
    pxor        xmm0, xmm0                      ; zero, for the byte -> word unpack

.unpack_block1d16_h6_sse2_rowloop:
    movq        xmm1, MMWORD PTR [rsi]          ; src[0 .. 7]
    movq        xmm3, MMWORD PTR [rsi + 8]      ; src[8 .. 15]

    punpcklbw   xmm1, xmm0                      ; words 0 .. 7
    punpcklbw   xmm3, xmm0                      ; words 8 .. 15

    movdqa      XMMWORD PTR [rdi], xmm1
    movdqa      XMMWORD PTR [rdi + 16], xmm3

    lea         rsi, [rsi + rax]                ; next source row
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(4)           ;[output_width]
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .unpack_block1d16_h6_sse2_rowloop ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_bilinear_predict16x16_sse2
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int   xoffset,
;    int   yoffset,
;    unsigned char *dst_ptr,
;    int dst_pitch
;)
;
; 16x16 bilinear prediction. Each filter table entry is 32 bytes: the first
; tap vector at +0 and the second at +16, indexed by xoffset / yoffset.
; Three code paths:
;   * xoffset != 0 and yoffset != 0 : horizontal pass then vertical pass
;   * xoffset == 0                  : vertical pass only   (.b16x16_sp_only)
;   * yoffset == 0                  : horizontal pass only (.b16x16_fp_only)
; All paths write 16 rows; the end pointer is dst_ptr + 16 * dst_pitch.
; Destination rows are stored with movdqa and must be 16-byte aligned.
; Source rows are read with movdqu. The horizontal pass also reads
; [src + 1 .. src + 16], and every vertical pass reads one row beyond the 16
; it outputs.
extern sym(vp8_bilinear_filters_x86_8)
global sym(vp8_bilinear_predict16x16_sse2) PRIVATE
sym(vp8_bilinear_predict16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]
    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]

    lea         rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
    movsxd      rax, dword ptr arg(2)           ;xoffset

    test        rax, rax                        ;skip first_pass filter if xoffset=0
    je          .b16x16_sp_only

    shl         rax, 5                          ; 32 bytes per table entry
    add         rax, rcx                        ;HFilter

    mov         rdi, arg(4)                     ;dst_ptr
    mov         rsi, arg(0)                     ;src_ptr
    movsxd      rdx, dword ptr arg(5)           ;dst_pitch

    movdqa      xmm1, [rax]                     ; horizontal tap 0
    movdqa      xmm2, [rax + 16]                ; horizontal tap 1

    movsxd      rax, dword ptr arg(3)           ;yoffset

    test        rax, rax                        ;skip second_pass filter if yoffset=0
    je          .b16x16_fp_only

    shl         rax, 5
    add         rax, rcx                        ;VFilter

    lea         rcx, [rdi + rdx * 8]
    lea         rcx, [rcx + rdx * 8]            ; rcx = dst_ptr + 16 * dst_pitch
    movsxd      rdx, dword ptr arg(1)           ;src_pixels_per_line

    pxor        xmm0, xmm0

%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(5)            ;dst_pitch
%endif

    ; Horizontal pass for the first source row. The result is kept packed as
    ; bytes in xmm7 and serves as the "previous row" of the vertical pass.
    movdqu      xmm3, [rsi]                     ; src[0 .. 15]
    movdqa      xmm4, xmm3

    punpcklbw   xmm3, xmm0                      ; src[0 .. 7] as words
    punpckhbw   xmm4, xmm0                      ; src[8 .. 15] as words

    pmullw      xmm3, xmm1
    pmullw      xmm4, xmm1

    movdqu      xmm5, [rsi + 1]                 ; src[1 .. 16]
    movdqa      xmm6, xmm5

    punpcklbw   xmm5, xmm0
    punpckhbw   xmm6, xmm0

    pmullw      xmm5, xmm2
    pmullw      xmm6, xmm2

    paddw       xmm3, xmm5
    paddw       xmm4, xmm6

    paddw       xmm3, [GLOBAL(rd)]              ; xmm3 += round value
    psraw       xmm3, VP8_FILTER_SHIFT          ; xmm3 /= 128

    paddw       xmm4, [GLOBAL(rd)]
    psraw       xmm4, VP8_FILTER_SHIFT

    movdqa      xmm7, xmm3
    packuswb    xmm7, xmm4

    add         rsi, rdx                        ; next line
.next_row:
    ; Horizontal pass for the current row.
    movdqu      xmm3, [rsi]                     ; src[0 .. 15]
    movdqa      xmm4, xmm3

    punpcklbw   xmm3, xmm0
    punpckhbw   xmm4, xmm0

    pmullw      xmm3, xmm1
    pmullw      xmm4, xmm1

    movdqu      xmm5, [rsi + 1]                 ; src[1 .. 16]
    movdqa      xmm6, xmm5

    punpcklbw   xmm5, xmm0
    punpckhbw   xmm6, xmm0

    pmullw      xmm5, xmm2
    pmullw      xmm6, xmm2

    paddw       xmm3, xmm5
    paddw       xmm4, xmm6

    ; Previous filtered row * vertical tap 0.
    movdqa      xmm5, xmm7
    movdqa      xmm6, xmm7

    punpcklbw   xmm5, xmm0
    punpckhbw   xmm6, xmm0

    pmullw      xmm5, [rax]
    pmullw      xmm6, [rax]

    paddw       xmm3, [GLOBAL(rd)]              ; xmm3 += round value
    psraw       xmm3, VP8_FILTER_SHIFT          ; xmm3 /= 128

    paddw       xmm4, [GLOBAL(rd)]
    psraw       xmm4, VP8_FILTER_SHIFT

    ; Keep this row's horizontal result for the next iteration.
    movdqa      xmm7, xmm3
    packuswb    xmm7, xmm4

    ; Current filtered row * vertical tap 1, then combine and round.
    pmullw      xmm3, [rax + 16]
    pmullw      xmm4, [rax + 16]

    paddw       xmm3, xmm5
    paddw       xmm4, xmm6

    paddw       xmm3, [GLOBAL(rd)]              ; xmm3 += round value
    psraw       xmm3, VP8_FILTER_SHIFT          ; xmm3 /= 128

    paddw       xmm4, [GLOBAL(rd)]
    psraw       xmm4, VP8_FILTER_SHIFT

    packuswb    xmm3, xmm4
    movdqa      [rdi], xmm3                     ; store the results in the destination

    add         rsi, rdx                        ; next line
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(5)           ;dst_pitch
%else
    add         rdi, r8
%endif

    cmp         rdi, rcx
    jne         .next_row

    jmp         .done

.b16x16_sp_only:
    ; Vertical pass only. rcx still holds the filter table base here.
    movsxd      rax, dword ptr arg(3)           ;yoffset
    shl         rax, 5
    add         rax, rcx                        ;VFilter

    mov         rdi, arg(4)                     ;dst_ptr
    mov         rsi, arg(0)                     ;src_ptr
    movsxd      rdx, dword ptr arg(5)           ;dst_pitch

    movdqa      xmm1, [rax]                     ; vertical tap 0
    movdqa      xmm2, [rax + 16]                ; vertical tap 1

    lea         rcx, [rdi + rdx * 8]
    lea         rcx, [rcx + rdx * 8]            ; rcx = dst_ptr + 16 * dst_pitch
    movsxd      rax, dword ptr arg(1)           ;src_pixels_per_line

    pxor        xmm0, xmm0

    ; xmm7 always carries the previous source row (raw bytes).
    movdqu      xmm7, [rsi]

    add         rsi, rax                        ; next line
.next_row_spo:
    movdqu      xmm3, [rsi]                     ; current row

    movdqa      xmm5, xmm7                      ; previous row, two copies
    movdqa      xmm6, xmm7

    movdqa      xmm4, xmm3                      ; make a copy of current line
    movdqa      xmm7, xmm3                      ; becomes "previous" next time

    punpcklbw   xmm5, xmm0
    punpckhbw   xmm6, xmm0
    punpcklbw   xmm3, xmm0
    punpckhbw   xmm4, xmm0

    pmullw      xmm5, xmm1
    pmullw      xmm6, xmm1
    pmullw      xmm3, xmm2
    pmullw      xmm4, xmm2

    paddw       xmm3, xmm5
    paddw       xmm4, xmm6

    paddw       xmm3, [GLOBAL(rd)]              ; xmm3 += round value
    psraw       xmm3, VP8_FILTER_SHIFT          ; xmm3 /= 128

    paddw       xmm4, [GLOBAL(rd)]
    psraw       xmm4, VP8_FILTER_SHIFT

    packuswb    xmm3, xmm4
    movdqa      [rdi], xmm3                     ; store the results in the destination

    add         rsi, rax                        ; next line
    add         rdi, rdx                        ;dst_pitch
    cmp         rdi, rcx
    jne         .next_row_spo

    jmp         .done

.b16x16_fp_only:
    ; Horizontal pass only. On entry rdi = dst_ptr, rsi = src_ptr,
    ; rdx = dst_pitch and xmm1 / xmm2 = horizontal taps.
    lea         rcx, [rdi + rdx * 8]
    lea         rcx, [rcx + rdx * 8]            ; rcx = dst_ptr + 16 * dst_pitch
    movsxd      rax, dword ptr arg(1)           ;src_pixels_per_line
    pxor        xmm0, xmm0

.next_row_fpo:
    movdqu      xmm3, [rsi]                     ; src[0 .. 15]
    movdqa      xmm4, xmm3

    punpcklbw   xmm3, xmm0
    punpckhbw   xmm4, xmm0

    pmullw      xmm3, xmm1
    pmullw      xmm4, xmm1

    movdqu      xmm5, [rsi + 1]                 ; src[1 .. 16]
    movdqa      xmm6, xmm5

    punpcklbw   xmm5, xmm0
    punpckhbw   xmm6, xmm0

    pmullw      xmm5, xmm2
    pmullw      xmm6, xmm2

    paddw       xmm3, xmm5
    paddw       xmm4, xmm6

    paddw       xmm3, [GLOBAL(rd)]              ; xmm3 += round value
    psraw       xmm3, VP8_FILTER_SHIFT          ; xmm3 /= 128

    paddw       xmm4, [GLOBAL(rd)]
    psraw       xmm4, VP8_FILTER_SHIFT

    packuswb    xmm3, xmm4
    movdqa      [rdi], xmm3                     ; store the results in the destination

    add         rsi, rax                        ; next line
    add         rdi, rdx                        ; dst_pitch
    cmp         rdi, rcx
    jne         .next_row_fpo

.done:
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_bilinear_predict8x8_sse2
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int   xoffset,
;    int   yoffset,
;    unsigned char *dst_ptr,
;    int dst_pitch
;)
;
; 8x8 bilinear prediction, always running both passes. Nine source rows are
; first copied into a 16-byte aligned, 144 byte stack buffer. Each copy is a
; full 16-byte movdqu although the filter only uses bytes 0 .. 8 of a row.
; rsp itself is then used as the row cursor: it is advanced by 16 nine times
; in total (once before the loop, eight times inside it), which brings it back
; to the slot from which the closing "pop rsp" restores the pre-alignment
; stack pointer. Do not change the row count without adjusting that.
global sym(vp8_bilinear_predict8x8_sse2) PRIVATE
sym(vp8_bilinear_predict8x8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 144                        ; reserve 144 bytes (9 rows * 16)

    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]
    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]
    lea         rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]

    mov         rsi, arg(0)                     ;src_ptr
    movsxd      rdx, dword ptr arg(1)           ;src_pixels_per_line

    ;Read 9-line unaligned data in and put them on stack. This gives a big
    ;performance boost.
    movdqu      xmm0, [rsi]                     ; row 0
    lea         rax, [rdx + rdx * 2]            ; rax = 3 * pitch
    movdqu      xmm1, [rsi + rdx]               ; row 1
    movdqu      xmm2, [rsi + rdx * 2]           ; row 2
    add         rsi, rax
    movdqu      xmm3, [rsi]                     ; row 3
    movdqu      xmm4, [rsi + rdx]               ; row 4
    movdqu      xmm5, [rsi + rdx * 2]           ; row 5
    add         rsi, rax
    movdqu      xmm6, [rsi]                     ; row 6
    movdqu      xmm7, [rsi + rdx]               ; row 7

    movdqa      XMMWORD PTR [rsp], xmm0         ; free xmm0 for row 8

    movdqu      xmm0, [rsi + rdx * 2]           ; row 8

    movdqa      XMMWORD PTR [rsp + 16], xmm1
    movdqa      XMMWORD PTR [rsp + 32], xmm2
    movdqa      XMMWORD PTR [rsp + 48], xmm3
    movdqa      XMMWORD PTR [rsp + 64], xmm4
    movdqa      XMMWORD PTR [rsp + 80], xmm5
    movdqa      XMMWORD PTR [rsp + 96], xmm6
    movdqa      XMMWORD PTR [rsp + 112], xmm7
    movdqa      XMMWORD PTR [rsp + 128], xmm0

    movsxd      rax, dword ptr arg(2)           ;xoffset
    shl         rax, 5                          ; 32 bytes per table entry
    add         rax, rcx                        ;HFilter

    mov         rdi, arg(4)                     ;dst_ptr
    movsxd      rdx, dword ptr arg(5)           ;dst_pitch

    movdqa      xmm1, [rax]                     ; horizontal tap 0
    movdqa      xmm2, [rax + 16]                ; horizontal tap 1

    movsxd      rax, dword ptr arg(3)           ;yoffset
    shl         rax, 5
    add         rax, rcx                        ;VFilter

    lea         rcx, [rdi + rdx * 8]            ; rcx = dst_ptr + 8 * dst_pitch

    movdqa      xmm5, [rax]                     ; vertical tap 0
    movdqa      xmm6, [rax + 16]                ; vertical tap 1

    pxor        xmm0, xmm0

    ; Horizontal pass for row 0; the word result is carried in xmm7.
    movdqa      xmm3, XMMWORD PTR [rsp]
    movdqa      xmm4, xmm3                      ; make a copy of current line
    psrldq      xmm4, 1

    punpcklbw   xmm3, xmm0                      ; 00 01 02 03 04 05 06 07
    punpcklbw   xmm4, xmm0                      ; 01 02 03 04 05 06 07 08

    pmullw      xmm3, xmm1
    pmullw      xmm4, xmm2

    paddw       xmm3, xmm4

    paddw       xmm3, [GLOBAL(rd)]              ; xmm3 += round value
    psraw       xmm3, VP8_FILTER_SHIFT          ; xmm3 /= 128

    movdqa      xmm7, xmm3
    add         rsp, 16                         ; next line
.next_row8x8:
    ; Horizontal pass for the current row.
    movdqa      xmm3, XMMWORD PTR [rsp]
    movdqa      xmm4, xmm3                      ; make a copy of current line
    psrldq      xmm4, 1

    punpcklbw   xmm3, xmm0                      ; 00 01 02 03 04 05 06 07
    punpcklbw   xmm4, xmm0                      ; 01 02 03 04 05 06 07 08

    pmullw      xmm3, xmm1
    pmullw      xmm4, xmm2

    paddw       xmm3, xmm4
    pmullw      xmm7, xmm5                      ; previous row * vertical tap 0

    paddw       xmm3, [GLOBAL(rd)]              ; xmm3 += round value
    psraw       xmm3, VP8_FILTER_SHIFT          ; xmm3 /= 128

    movdqa      xmm4, xmm3                      ; keep unscaled row for next time

    pmullw      xmm3, xmm6                      ; current row * vertical tap 1
    paddw       xmm3, xmm7

    movdqa      xmm7, xmm4

    paddw       xmm3, [GLOBAL(rd)]              ; xmm3 += round value
    psraw       xmm3, VP8_FILTER_SHIFT          ; xmm3 /= 128

    packuswb    xmm3, xmm0
    movq        [rdi], xmm3                     ; store the results in the destination

    add         rsp, 16                         ; next line
    add         rdi, rdx

    cmp         rdi, rcx
    jne         .next_row8x8

    ; rsp has advanced by 9 * 16 = 144 bytes, so no "add rsp, 144" is needed:
    ; it now points at the stack pointer saved by ALIGN_STACK.
    pop         rsp
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


SECTION_RODATA
align 16
rd:                                             ; rounding term: 1 << (VP8_FILTER_SHIFT - 1)
    times 8 dw 0x40