;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

; Syntax: NASM/Yasm (Intel operand order). Argument access, GOT handling and
; register save/restore go through the macros of x86_abi_support.asm
; (arg(), sym(), GLOBAL(), SHADOW_ARGS_TO_STACK, SAVE_XMM, ...).

%include "vpx_ports/x86_abi_support.asm"

%define BLOCK_HEIGHT_WIDTH 4
%define VP8_FILTER_WEIGHT 128
%define VP8_FILTER_SHIFT  7

SECTION .text

;------------------------------------------------------------------------------
;void vp8_filter_block1d8_h6_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned short *output_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned int    pixel_step,
;    unsigned int    output_height,
;    unsigned int    output_width,
;    short           *vp8_filter
;)
;
; Horizontal 6-tap filter, 8 output pixels per row, one row per iteration.
;
;   src_ptr              first output column; bytes src_ptr[-2 .. 13] are
;                        loaded on every row.
;   output_ptr           receives 8 16-bit values per row via movdqa, so it
;                        must be 16-byte aligned on every row.
;   src_pixels_per_line  byte distance between source rows.
;   pixel_step           not read by this routine.
;   output_height        number of rows; the count is tested after the row is
;                        processed, so it must be at least 1.
;   output_width         added to output_ptr after each row as a byte offset.
;   vp8_filter           six 16-byte rows of 8 words (one row per tap), used
;                        as pmullw memory operands at offsets 0,16,...,80.
;
; Per pixel: sum of six tap products (saturating 16-bit adds), plus the
; rounding constant rd, arithmetic shift right by VP8_FILTER_SHIFT, clamp
; to 0..255, then widened back to 16 bits for the store.
;------------------------------------------------------------------------------
global sym(vp8_filter_block1d8_h6_sse2) PRIVATE
sym(vp8_filter_block1d8_h6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rdx, arg(6)                     ; rdx = vp8_filter
    mov         rsi, arg(0)                     ; rsi = src_ptr

    mov         rdi, arg(1)                     ; rdi = output_ptr

    movsxd      rcx, dword ptr arg(4)           ; rcx = output_height (row counter)
    movsxd      rax, dword ptr arg(2)           ; rax = src_pixels_per_line (source pitch)
%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(5)            ; r8  = output_width (output pitch, bytes)
%endif
    pxor        xmm0, xmm0                      ; zero, used for byte->word unpack

.filter_block1d8_h6_rowloop:
    movq        xmm3, MMWORD PTR [rsi - 2]      ; src[-2 .. 5]
    movq        xmm1, MMWORD PTR [rsi + 6]      ; src[ 6 .. 13]

    prefetcht2  [rsi+rax-2]                     ; hint: next source row

    pslldq      xmm1, 8
    por         xmm1, xmm3                      ; xmm1 = src[-2 .. 13]

    movdqa      xmm4, xmm1
    movdqa      xmm5, xmm1

    movdqa      xmm6, xmm1
    movdqa      xmm7, xmm1

    punpcklbw   xmm3, xmm0                      ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
    psrldq      xmm4, 1                         ; drop 1 byte: starts at src[-1]

    pmullw      xmm3, XMMWORD PTR [rdx]         ; x[-2] * H[-2]; Tap 1
    punpcklbw   xmm4, xmm0                      ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1

    psrldq      xmm5, 2                         ; starts at src[0]
    pmullw      xmm4, XMMWORD PTR [rdx+16]      ; x[-1] * H[-1]; Tap 2

    punpcklbw   xmm5, xmm0                      ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
    psrldq      xmm6, 3                         ; starts at src[1]

    pmullw      xmm5, [rdx+32]                  ; x[ 0] * H[ 0]; Tap 3

    punpcklbw   xmm6, xmm0                      ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
    psrldq      xmm7, 4                         ; starts at src[2]

    pmullw      xmm6, [rdx+48]                  ; x[ 1] * H[ 1]; Tap 4

    punpcklbw   xmm7, xmm0                      ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
    psrldq      xmm1, 5                         ; starts at src[3]

    pmullw      xmm7, [rdx+64]                  ; x[ 2] * H[ 2]; Tap 5

    punpcklbw   xmm1, xmm0                      ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
    pmullw      xmm1, [rdx+80]                  ; x[ 3] * H[ 3]; Tap 6

    ; Saturating adds: the summation order is part of the result, keep it.
    paddsw      xmm4, xmm7
    paddsw      xmm4, xmm5

    paddsw      xmm4, xmm3
    paddsw      xmm4, xmm6

    paddsw      xmm4, xmm1
    paddsw      xmm4, [GLOBAL(rd)]              ; + rounding

    psraw       xmm4, VP8_FILTER_SHIFT

    packuswb    xmm4, xmm0                      ; clamp to 0..255
    punpcklbw   xmm4, xmm0                      ; back to 16-bit lanes

    movdqa      XMMWORD Ptr [rdi], xmm4
    lea         rsi, [rsi + rax]                ; next source row

%if ABI_IS_32BIT
    add         rdi, DWORD Ptr arg(5)           ; [output_width]
%else
    add         rdi, r8
%endif
    dec         rcx

    jnz         .filter_block1d8_h6_rowloop     ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;------------------------------------------------------------------------------
;void vp8_filter_block1d16_h6_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned short *output_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned int    pixel_step,
;    unsigned int    output_height,
;    unsigned int    output_width,
;    short           *vp8_filter
;)
;
; Horizontal 6-tap filter, 16 output pixels per row, one row per iteration.
; The row is processed as two 8-pixel halves with the same arithmetic as
; vp8_filter_block1d8_h6_sse2; the halves are stored to [output_ptr] and
; [output_ptr + 16] with movdqa (16-byte aligned output required).
;
; Source bytes src_ptr[-2 .. 18] are loaded on every row. pixel_step is not
; read. output_width is added to output_ptr as a byte offset. output_height
; must be at least 1 (the counter is tested after the row is processed).
;------------------------------------------------------------------------------
global sym(vp8_filter_block1d16_h6_sse2) PRIVATE
sym(vp8_filter_block1d16_h6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rdx, arg(6)                     ; rdx = vp8_filter
    mov         rsi, arg(0)                     ; rsi = src_ptr

    mov         rdi, arg(1)                     ; rdi = output_ptr

    movsxd      rcx, dword ptr arg(4)           ; rcx = output_height (row counter)
    movsxd      rax, dword ptr arg(2)           ; rax = src_pixels_per_line (source pitch)
%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(5)            ; r8  = output_width (output pitch, bytes)
%endif

    pxor        xmm0, xmm0                      ; zero, used for byte->word unpack

.filter_block1d16_h6_sse2_rowloop:
    movq        xmm3, MMWORD PTR [rsi - 2]      ; src[-2 .. 5]
    movq        xmm1, MMWORD PTR [rsi + 6]      ; src[ 6 .. 13]

    ; Load from 11 to avoid reading out of bounds.
    movq        xmm2, MMWORD PTR [rsi +11]      ; src[11 .. 18]
    ; The lower bits are not cleared before 'or'ing with xmm1,
    ; but that is OK because the values in the overlapping positions
    ; are already equal to the ones in xmm1.
    pslldq      xmm2, 5

    por         xmm2, xmm1                      ; xmm2 = src[6 .. 18] (second half input)
    prefetcht2  [rsi+rax-2]                     ; hint: next source row

    pslldq      xmm1, 8
    por         xmm1, xmm3                      ; xmm1 = src[-2 .. 13] (first half input)

    ; ---- first half: output pixels 0..7 ----
    movdqa      xmm4, xmm1
    movdqa      xmm5, xmm1

    movdqa      xmm6, xmm1
    movdqa      xmm7, xmm1

    punpcklbw   xmm3, xmm0                      ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
    psrldq      xmm4, 1

    pmullw      xmm3, XMMWORD PTR [rdx]         ; x[-2] * H[-2]; Tap 1
    punpcklbw   xmm4, xmm0                      ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1

    psrldq      xmm5, 2
    pmullw      xmm4, XMMWORD PTR [rdx+16]      ; x[-1] * H[-1]; Tap 2

    punpcklbw   xmm5, xmm0                      ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
    psrldq      xmm6, 3

    pmullw      xmm5, [rdx+32]                  ; x[ 0] * H[ 0]; Tap 3

    punpcklbw   xmm6, xmm0                      ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
    psrldq      xmm7, 4

    pmullw      xmm6, [rdx+48]                  ; x[ 1] * H[ 1]; Tap 4

    punpcklbw   xmm7, xmm0                      ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
    psrldq      xmm1, 5

    pmullw      xmm7, [rdx+64]                  ; x[ 2] * H[ 2]; Tap 5

    punpcklbw   xmm1, xmm0                      ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
    pmullw      xmm1, [rdx+80]                  ; x[ 3] * H[ 3]; Tap 6

    ; Saturating adds: the summation order is part of the result, keep it.
    paddsw      xmm4, xmm7
    paddsw      xmm4, xmm5

    paddsw      xmm4, xmm3
    paddsw      xmm4, xmm6

    paddsw      xmm4, xmm1
    paddsw      xmm4, [GLOBAL(rd)]              ; + rounding

    psraw       xmm4, VP8_FILTER_SHIFT

    packuswb    xmm4, xmm0                      ; clamp to 0..255
    punpcklbw   xmm4, xmm0                      ; back to 16-bit lanes

    movdqa      XMMWORD Ptr [rdi], xmm4

    ; ---- second half: output pixels 8..15 (same steps, input starts at src[6]) ----
    movdqa      xmm3, xmm2
    movdqa      xmm4, xmm2

    movdqa      xmm5, xmm2
    movdqa      xmm6, xmm2

    movdqa      xmm7, xmm2

    punpcklbw   xmm3, xmm0                      ; words src[ 6 .. 13]
    psrldq      xmm4, 1

    pmullw      xmm3, XMMWORD PTR [rdx]         ; Tap 1
    punpcklbw   xmm4, xmm0                      ; words src[ 7 .. 14]

    psrldq      xmm5, 2
    pmullw      xmm4, XMMWORD PTR [rdx+16]      ; Tap 2

    punpcklbw   xmm5, xmm0                      ; words src[ 8 .. 15]
    psrldq      xmm6, 3

    pmullw      xmm5, [rdx+32]                  ; Tap 3

    punpcklbw   xmm6, xmm0                      ; words src[ 9 .. 16]
    psrldq      xmm7, 4

    pmullw      xmm6, [rdx+48]                  ; Tap 4

    punpcklbw   xmm7, xmm0                      ; words src[10 .. 17]
    psrldq      xmm2, 5

    pmullw      xmm7, [rdx+64]                  ; Tap 5

    punpcklbw   xmm2, xmm0                      ; words src[11 .. 18]
    pmullw      xmm2, [rdx+80]                  ; Tap 6

    paddsw      xmm4, xmm7
    paddsw      xmm4, xmm5

    paddsw      xmm4, xmm3
    paddsw      xmm4, xmm6

    paddsw      xmm4, xmm2
    paddsw      xmm4, [GLOBAL(rd)]              ; + rounding

    psraw       xmm4, VP8_FILTER_SHIFT

    packuswb    xmm4, xmm0                      ; clamp to 0..255
    punpcklbw   xmm4, xmm0                      ; back to 16-bit lanes

    movdqa      XMMWORD Ptr [rdi+16], xmm4

    lea         rsi, [rsi + rax]                ; next source row
%if ABI_IS_32BIT
    add         rdi, DWORD Ptr arg(5)           ; [output_width]
%else
    add         rdi, r8
%endif

    dec         rcx
    jnz         .filter_block1d16_h6_sse2_rowloop ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;------------------------------------------------------------------------------
;void vp8_filter_block1d8_v6_sse2
;(
;    short          *src_ptr,
;    unsigned char  *output_ptr,
;    int             dst_pitch,
;    unsigned int    pixels_per_line,
;    unsigned int    pixel_step,
;    unsigned int    output_height,
;    unsigned int    output_width,
;    short          *vp8_filter
;)
;
; Vertical 6-tap filter over 16-bit input, 8 output pixels per row.
;
; pixels_per_line is used directly as a byte offset between input rows. The
; routine steps src_ptr back by two rows itself, then for every output row
; reads six consecutive input rows (8 words each, movdqa: rows must be
; 16-byte aligned), multiplies them by the six filter rows at
; vp8_filter + 0,16,...,80, sums with saturation, rounds, shifts by
; VP8_FILTER_SHIFT, clamps to 0..255 and stores 8 bytes.
;
; dst_pitch is the byte distance between output rows. pixel_step and
; output_width are not read. output_height must be at least 1.
;------------------------------------------------------------------------------
global sym(vp8_filter_block1d8_v6_sse2) PRIVATE
sym(vp8_filter_block1d8_v6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rax, arg(7)                     ; rax = vp8_filter
    movsxd      rdx, dword ptr arg(3)           ; rdx = pixels_per_line (input pitch, bytes)

    mov         rdi, arg(1)                     ; rdi = output_ptr
    mov         rsi, arg(0)                     ; rsi = src_ptr

    sub         rsi, rdx
    sub         rsi, rdx                        ; rsi = src_ptr - 2 rows (first tap row)

    movsxd      rcx, DWORD PTR arg(5)           ; rcx = output_height (row counter)
    pxor        xmm0, xmm0                      ; zero, second operand of the pack

    movdqa      xmm7, XMMWORD PTR [GLOBAL(rd)]  ; xmm7 = rounding constant
%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(2)            ; r8  = dst_pitch
%endif

.vp8_filter_block1d8_v6_sse2_loop:
    movdqa      xmm1, XMMWORD PTR [rsi]             ; row 1
    pmullw      xmm1, [rax]

    movdqa      xmm2, XMMWORD PTR [rsi + rdx]       ; row 2
    pmullw      xmm2, [rax + 16]

    movdqa      xmm3, XMMWORD PTR [rsi + rdx * 2]   ; row 3
    pmullw      xmm3, [rax + 32]

    movdqa      xmm5, XMMWORD PTR [rsi + rdx * 4]   ; row 5
    pmullw      xmm5, [rax + 64]

    add         rsi, rdx                            ; advance one row; also lets
                                                    ; rows 4 and 6 use *2 and *4 scales
    movdqa      xmm4, XMMWORD PTR [rsi + rdx * 2]   ; row 4

    pmullw      xmm4, [rax + 48]
    movdqa      xmm6, XMMWORD PTR [rsi + rdx * 4]   ; row 6

    pmullw      xmm6, [rax + 80]

    ; Saturating adds in the order 2 5 3 1 4 6; the order is part of the result.
    paddsw      xmm2, xmm5
    paddsw      xmm2, xmm3

    paddsw      xmm2, xmm1
    paddsw      xmm2, xmm4

    paddsw      xmm2, xmm6
    paddsw      xmm2, xmm7                      ; + rounding

    psraw       xmm2, VP8_FILTER_SHIFT
    packuswb    xmm2, xmm0                      ; pack and saturate

    movq        QWORD PTR [rdi], xmm2           ; store the results in the destination
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(2)           ; [dst_pitch]
%else
    add         rdi, r8
%endif
    dec         rcx                             ; decrement count
    jnz         .vp8_filter_block1d8_v6_sse2_loop ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;------------------------------------------------------------------------------
;void vp8_filter_block1d16_v6_sse2
;(
;    unsigned short *src_ptr,
;    unsigned char  *output_ptr,
;    int             dst_pitch,
;    unsigned int    pixels_per_line,
;    unsigned int    pixel_step,
;    unsigned int    output_height,
;    unsigned int    output_width,
;    const short    *vp8_filter
;)
;
; Vertical 6-tap filter over 16-bit input, 16 output pixels per row.
;
; Same arithmetic as vp8_filter_block1d8_v6_sse2, carried out on two
; 8-word halves per input row ([row] and [row + 16]). All eight xmm
; registers hold data, so the rounding constant is reloaded every
; iteration. The 16 result bytes are stored with movdqa (16-byte aligned
; output rows required).
;
; pixels_per_line is a byte offset between input rows; dst_pitch is the byte
; distance between output rows. pixel_step and output_width are not read.
; output_height must be at least 1.
;------------------------------------------------------------------------------
global sym(vp8_filter_block1d16_v6_sse2) PRIVATE
sym(vp8_filter_block1d16_v6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rax, arg(7)                     ; rax = vp8_filter
    movsxd      rdx, dword ptr arg(3)           ; rdx = pixels_per_line (input pitch, bytes)

    mov         rdi, arg(1)                     ; rdi = output_ptr
    mov         rsi, arg(0)                     ; rsi = src_ptr

    sub         rsi, rdx
    sub         rsi, rdx                        ; rsi = src_ptr - 2 rows (first tap row)

    movsxd      rcx, DWORD PTR arg(5)           ; rcx = output_height (row counter)
%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(2)            ; r8  = dst_pitch
%endif

.vp8_filter_block1d16_v6_sse2_loop:
; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order.
    movdqa      xmm1, XMMWORD PTR [rsi + rdx]           ; line 2
    movdqa      xmm2, XMMWORD PTR [rsi + rdx + 16]
    pmullw      xmm1, [rax + 16]
    pmullw      xmm2, [rax + 16]

    movdqa      xmm3, XMMWORD PTR [rsi + rdx * 4]       ; line 5
    movdqa      xmm4, XMMWORD PTR [rsi + rdx * 4 + 16]
    pmullw      xmm3, [rax + 64]
    pmullw      xmm4, [rax + 64]

    movdqa      xmm5, XMMWORD PTR [rsi + rdx * 2]       ; line 3
    movdqa      xmm6, XMMWORD PTR [rsi + rdx * 2 + 16]
    pmullw      xmm5, [rax + 32]
    pmullw      xmm6, [rax + 32]

    movdqa      xmm7, XMMWORD PTR [rsi]                 ; line 1
    movdqa      xmm0, XMMWORD PTR [rsi + 16]
    pmullw      xmm7, [rax]
    pmullw      xmm0, [rax]

    ; xmm1 accumulates the low 8 pixels, xmm2 the high 8 pixels.
    paddsw      xmm1, xmm3
    paddsw      xmm2, xmm4
    paddsw      xmm1, xmm5
    paddsw      xmm2, xmm6
    paddsw      xmm1, xmm7
    paddsw      xmm2, xmm0

    add         rsi, rdx                                ; advance one input row

    movdqa      xmm3, XMMWORD PTR [rsi + rdx * 2]       ; line 4
    movdqa      xmm4, XMMWORD PTR [rsi + rdx * 2 + 16]
    pmullw      xmm3, [rax + 48]
    pmullw      xmm4, [rax + 48]

    movdqa      xmm5, XMMWORD PTR [rsi + rdx * 4]       ; line 6
    movdqa      xmm6, XMMWORD PTR [rsi + rdx * 4 + 16]
    pmullw      xmm5, [rax + 80]
    pmullw      xmm6, [rax + 80]

    movdqa      xmm7, XMMWORD PTR [GLOBAL(rd)]          ; rounding constant
    pxor        xmm0, xmm0                              ; clear xmm0 (not read again
                                                        ; before it is reloaded)

    paddsw      xmm1, xmm3
    paddsw      xmm2, xmm4
    paddsw      xmm1, xmm5
    paddsw      xmm2, xmm6

    paddsw      xmm1, xmm7                      ; + rounding
    paddsw      xmm2, xmm7

    psraw       xmm1, VP8_FILTER_SHIFT
    psraw       xmm2, VP8_FILTER_SHIFT

    packuswb    xmm1, xmm2                      ; pack and saturate
    movdqa      XMMWORD PTR [rdi], xmm1         ; store the results in the destination
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(2)           ; [dst_pitch]
%else
    add         rdi, r8
%endif
    dec         rcx                             ; decrement count
    jnz         .vp8_filter_block1d16_v6_sse2_loop ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;------------------------------------------------------------------------------
;void vp8_filter_block1d8_h6_only_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    int             dst_pitch,
;    unsigned int    output_height,
;    const short    *vp8_filter
;)
; First-pass filter only when yoffset==0
;
; Horizontal 6-tap filter, 8 output pixels per row, written directly as
; bytes (movq, no alignment requirement on output_ptr). Arithmetic is the
; same as vp8_filter_block1d8_h6_sse2 up to the clamp to 0..255.
;
; Source bytes src_ptr[-2 .. 13] are loaded on every row.
; src_pixels_per_line / dst_pitch are the byte distances between rows.
; output_height must be at least 1.
;------------------------------------------------------------------------------
global sym(vp8_filter_block1d8_h6_only_sse2) PRIVATE
sym(vp8_filter_block1d8_h6_only_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rdx, arg(5)                     ; rdx = vp8_filter
    mov         rsi, arg(0)                     ; rsi = src_ptr

    mov         rdi, arg(2)                     ; rdi = output_ptr

    movsxd      rcx, dword ptr arg(4)           ; rcx = output_height (row counter)
    movsxd      rax, dword ptr arg(1)           ; rax = src_pixels_per_line (source pitch)
%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(3)            ; r8  = dst_pitch
%endif
    pxor        xmm0, xmm0                      ; zero, used for byte->word unpack

.filter_block1d8_h6_only_rowloop:
    movq        xmm3, MMWORD PTR [rsi - 2]      ; src[-2 .. 5]
    movq        xmm1, MMWORD PTR [rsi + 6]      ; src[ 6 .. 13]

    prefetcht2  [rsi+rax-2]                     ; hint: next source row

    pslldq      xmm1, 8
    por         xmm1, xmm3                      ; xmm1 = src[-2 .. 13]

    movdqa      xmm4, xmm1
    movdqa      xmm5, xmm1

    movdqa      xmm6, xmm1
    movdqa      xmm7, xmm1

    punpcklbw   xmm3, xmm0                      ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
    psrldq      xmm4, 1

    pmullw      xmm3, XMMWORD PTR [rdx]         ; x[-2] * H[-2]; Tap 1
    punpcklbw   xmm4, xmm0                      ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1

    psrldq      xmm5, 2
    pmullw      xmm4, XMMWORD PTR [rdx+16]      ; x[-1] * H[-1]; Tap 2

    punpcklbw   xmm5, xmm0                      ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
    psrldq      xmm6, 3

    pmullw      xmm5, [rdx+32]                  ; x[ 0] * H[ 0]; Tap 3

    punpcklbw   xmm6, xmm0                      ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
    psrldq      xmm7, 4

    pmullw      xmm6, [rdx+48]                  ; x[ 1] * H[ 1]; Tap 4

    punpcklbw   xmm7, xmm0                      ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
    psrldq      xmm1, 5

    pmullw      xmm7, [rdx+64]                  ; x[ 2] * H[ 2]; Tap 5

    punpcklbw   xmm1, xmm0                      ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
    pmullw      xmm1, [rdx+80]                  ; x[ 3] * H[ 3]; Tap 6

    ; Saturating adds: the summation order is part of the result, keep it.
    paddsw      xmm4, xmm7
    paddsw      xmm4, xmm5

    paddsw      xmm4, xmm3
    paddsw      xmm4, xmm6

    paddsw      xmm4, xmm1
    paddsw      xmm4, [GLOBAL(rd)]              ; + rounding

    psraw       xmm4, VP8_FILTER_SHIFT

    packuswb    xmm4, xmm0                      ; clamp to 0..255

    movq        QWORD PTR [rdi], xmm4           ; store the results in the destination
    lea         rsi, [rsi + rax]                ; next source row

%if ABI_IS_32BIT
    add         rdi, DWORD Ptr arg(3)           ; dst_pitch
%else
    add         rdi, r8
%endif
    dec         rcx

    jnz         .filter_block1d8_h6_only_rowloop ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;------------------------------------------------------------------------------
;void vp8_filter_block1d16_h6_only_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    int             dst_pitch,
;    unsigned int    output_height,
;    const short    *vp8_filter
;)
; First-pass filter only when yoffset==0
;
; Horizontal 6-tap filter, 16 output pixels per row, written directly as
; bytes in two 8-byte halves ([output_ptr] and [output_ptr + 8], movq).
;
; Source bytes src_ptr[-2 .. 18] are loaded on every row.
; src_pixels_per_line / dst_pitch are the byte distances between rows.
; output_height must be at least 1.
;------------------------------------------------------------------------------
global sym(vp8_filter_block1d16_h6_only_sse2) PRIVATE
sym(vp8_filter_block1d16_h6_only_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rdx, arg(5)                     ; rdx = vp8_filter
    mov         rsi, arg(0)                     ; rsi = src_ptr

    mov         rdi, arg(2)                     ; rdi = output_ptr

    movsxd      rcx, dword ptr arg(4)           ; rcx = output_height (row counter)
    movsxd      rax, dword ptr arg(1)           ; rax = src_pixels_per_line (source pitch)
%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(3)            ; r8  = dst_pitch
%endif

    pxor        xmm0, xmm0                      ; zero, used for byte->word unpack

.filter_block1d16_h6_only_sse2_rowloop:
    movq        xmm3, MMWORD PTR [rsi - 2]      ; src[-2 .. 5]
    movq        xmm1, MMWORD PTR [rsi + 6]      ; src[ 6 .. 13]

    ; The second half only consumes bytes up to src[18], so load the 8 bytes
    ; ending there (src[11 .. 18]) instead of src[14 .. 21]; this avoids
    ; touching 3 bytes past what the filter needs, matching
    ; vp8_filter_block1d16_h6_sse2.
    movq        xmm2, MMWORD PTR [rsi +11]
    ; After the shift, bytes 5..7 overlap xmm1's src[11 .. 13]; both sides
    ; hold the same values, so the 'or' below leaves them unchanged.
    pslldq      xmm2, 5

    por         xmm2, xmm1                      ; xmm2 = src[6 .. 18] (second half input)
    prefetcht2  [rsi+rax-2]                     ; hint: next source row

    pslldq      xmm1, 8
    por         xmm1, xmm3                      ; xmm1 = src[-2 .. 13] (first half input)

    ; ---- first half: output pixels 0..7 ----
    movdqa      xmm4, xmm1
    movdqa      xmm5, xmm1

    movdqa      xmm6, xmm1
    movdqa      xmm7, xmm1

    punpcklbw   xmm3, xmm0                      ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
    psrldq      xmm4, 1

    pmullw      xmm3, XMMWORD PTR [rdx]         ; x[-2] * H[-2]; Tap 1
    punpcklbw   xmm4, xmm0                      ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1

    psrldq      xmm5, 2
    pmullw      xmm4, XMMWORD PTR [rdx+16]      ; x[-1] * H[-1]; Tap 2

    punpcklbw   xmm5, xmm0                      ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
    psrldq      xmm6, 3

    pmullw      xmm5, [rdx+32]                  ; x[ 0] * H[ 0]; Tap 3

    punpcklbw   xmm6, xmm0                      ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
    psrldq      xmm7, 4

    pmullw      xmm6, [rdx+48]                  ; x[ 1] * H[ 1]; Tap 4

    punpcklbw   xmm7, xmm0                      ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
    psrldq      xmm1, 5

    pmullw      xmm7, [rdx+64]                  ; x[ 2] * H[ 2]; Tap 5

    punpcklbw   xmm1, xmm0                      ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
    pmullw      xmm1, [rdx+80]                  ; x[ 3] * H[ 3]; Tap 6

    ; Saturating adds: the summation order is part of the result, keep it.
    paddsw      xmm4, xmm7
    paddsw      xmm4, xmm5

    paddsw      xmm4, xmm3
    paddsw      xmm4, xmm6

    paddsw      xmm4, xmm1
    paddsw      xmm4, [GLOBAL(rd)]              ; + rounding

    psraw       xmm4, VP8_FILTER_SHIFT

    packuswb    xmm4, xmm0                      ; lower 8 bytes

    movq        QWORD Ptr [rdi], xmm4           ; store the results in the destination

    ; ---- second half: output pixels 8..15 (same steps, input starts at src[6]) ----
    movdqa      xmm3, xmm2
    movdqa      xmm4, xmm2

    movdqa      xmm5, xmm2
    movdqa      xmm6, xmm2

    movdqa      xmm7, xmm2

    punpcklbw   xmm3, xmm0                      ; words src[ 6 .. 13]
    psrldq      xmm4, 1

    pmullw      xmm3, XMMWORD PTR [rdx]         ; Tap 1
    punpcklbw   xmm4, xmm0                      ; words src[ 7 .. 14]

    psrldq      xmm5, 2
    pmullw      xmm4, XMMWORD PTR [rdx+16]      ; Tap 2

    punpcklbw   xmm5, xmm0                      ; words src[ 8 .. 15]
    psrldq      xmm6, 3

    pmullw      xmm5, [rdx+32]                  ; Tap 3

    punpcklbw   xmm6, xmm0                      ; words src[ 9 .. 16]
    psrldq      xmm7, 4

    pmullw      xmm6, [rdx+48]                  ; Tap 4

    punpcklbw   xmm7, xmm0                      ; words src[10 .. 17]
    psrldq      xmm2, 5

    pmullw      xmm7, [rdx+64]                  ; Tap 5

    punpcklbw   xmm2, xmm0                      ; words src[11 .. 18]
    pmullw      xmm2, [rdx+80]                  ; Tap 6

    paddsw      xmm4, xmm7
    paddsw      xmm4, xmm5

    paddsw      xmm4, xmm3
    paddsw      xmm4, xmm6

    paddsw      xmm4, xmm2
    paddsw      xmm4, [GLOBAL(rd)]              ; + rounding

    psraw       xmm4, VP8_FILTER_SHIFT

    packuswb    xmm4, xmm0                      ; higher 8 bytes

    movq        QWORD Ptr [rdi+8], xmm4         ; store the results in the destination

    lea         rsi, [rsi + rax]                ; next source row
%if ABI_IS_32BIT
    add         rdi, DWORD Ptr arg(3)           ; dst_pitch
%else
    add         rdi, r8
%endif

    dec         rcx
    jnz         .filter_block1d16_h6_only_sse2_rowloop ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;------------------------------------------------------------------------------
;void vp8_filter_block1d8_v6_only_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    int             dst_pitch,
;    unsigned int    output_height,
;    const short    *vp8_filter
;)
; Second-pass filter only when xoffset==0
;
; Vertical 6-tap filter over 8-bit input, 8 output pixels per row.
;
; For every output row the six input rows at src_ptr + k*src_pixels_per_line,
; k = 0..5, are read (8 bytes each) and widened to 16 bits. Unlike
; vp8_filter_block1d8_v6_sse2 this routine does NOT step src_ptr back by two
; rows, so src_ptr must already address the first tap row.
; NOTE(review): presumably the caller passes (block start - 2 rows) - confirm.
;
; dst_pitch is the byte distance between output rows. output_height must be
; at least 1.
;------------------------------------------------------------------------------
global sym(vp8_filter_block1d8_v6_only_sse2) PRIVATE
sym(vp8_filter_block1d8_v6_only_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                     ; rsi = src_ptr
    mov         rdi, arg(2)                     ; rdi = output_ptr

    movsxd      rcx, dword ptr arg(4)           ; rcx = output_height (row counter)
    movsxd      rdx, dword ptr arg(1)           ; rdx = src_pixels_per_line (source pitch)

    mov         rax, arg(5)                     ; rax = vp8_filter

    pxor        xmm0, xmm0                      ; zero, used for byte->word unpack

    movdqa      xmm7, XMMWORD PTR [GLOBAL(rd)]  ; xmm7 = rounding constant
%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(3)            ; r8  = dst_pitch
%endif

.vp8_filter_block1d8_v6_only_sse2_loop:
    movq        xmm1, MMWORD PTR [rsi]              ; row 1
    movq        xmm2, MMWORD PTR [rsi + rdx]        ; row 2
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]    ; row 3
    movq        xmm5, MMWORD PTR [rsi + rdx * 4]    ; row 5
    add         rsi, rdx                            ; advance one row; also lets
                                                    ; rows 4 and 6 use *2 and *4 scales
    movq        xmm4, MMWORD PTR [rsi + rdx * 2]    ; row 4
    movq        xmm6, MMWORD PTR [rsi + rdx * 4]    ; row 6

    punpcklbw   xmm1, xmm0
    pmullw      xmm1, [rax]

    punpcklbw   xmm2, xmm0
    pmullw      xmm2, [rax + 16]

    punpcklbw   xmm3, xmm0
    pmullw      xmm3, [rax + 32]

    punpcklbw   xmm5, xmm0
    pmullw      xmm5, [rax + 64]

    punpcklbw   xmm4, xmm0
    pmullw      xmm4, [rax + 48]

    punpcklbw   xmm6, xmm0
    pmullw      xmm6, [rax + 80]

    ; Saturating adds in the order 2 5 3 1 4 6; the order is part of the result.
    paddsw      xmm2, xmm5
    paddsw      xmm2, xmm3

    paddsw      xmm2, xmm1
    paddsw      xmm2, xmm4

    paddsw      xmm2, xmm6
    paddsw      xmm2, xmm7                      ; + rounding

    psraw       xmm2, VP8_FILTER_SHIFT
    packuswb    xmm2, xmm0                      ; pack and saturate

    movq        QWORD PTR [rdi], xmm2           ; store the results in the destination
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)           ; [dst_pitch]
%else
    add         rdi, r8
%endif
    dec         rcx                             ; decrement count
    jnz         .vp8_filter_block1d8_v6_only_sse2_loop ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;------------------------------------------------------------------------------
;void vp8_unpack_block1d16_h6_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned short *output_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned int    output_height,
;    unsigned int    output_width
;)
;
; Widens 16 source bytes per row to 16 zero-extended 16-bit values; no
; filtering is applied. The two 8-word halves are stored with movdqa to
; [output_ptr] and [output_ptr + 16] (16-byte aligned output rows required).
;
; src_pixels_per_line is the byte distance between source rows;
; output_width is added to output_ptr as a byte offset after each row.
; output_height must be at least 1. Only xmm0, xmm1 and xmm3 are used.
;------------------------------------------------------------------------------
global sym(vp8_unpack_block1d16_h6_sse2) PRIVATE
sym(vp8_unpack_block1d16_h6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                     ; rsi = src_ptr
    mov         rdi, arg(1)                     ; rdi = output_ptr

    movsxd      rcx, dword ptr arg(3)           ; rcx = output_height (row counter)
    movsxd      rax, dword ptr arg(2)           ; rax = src_pixels_per_line (source pitch)

    pxor        xmm0, xmm0                      ; zero, used for byte->word unpack
%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(4)            ; r8  = output_width (output pitch, bytes)
%endif

.unpack_block1d16_h6_sse2_rowloop:
    movq        xmm1, MMWORD PTR [rsi]          ; src[0 .. 7]
    movq        xmm3, MMWORD PTR [rsi+8]        ; src[8 .. 15]

    punpcklbw   xmm3, xmm0                      ; xx0f xx0e xx0d xx0c xx0b xx0a xx09 xx08
    punpcklbw   xmm1, xmm0                      ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00

    movdqa      XMMWORD Ptr [rdi], xmm1
    movdqa      XMMWORD Ptr [rdi + 16], xmm3

    lea         rsi, [rsi + rax]                ; next source row
%if ABI_IS_32BIT
    add         rdi, DWORD Ptr arg(4)           ; [output_width]
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .unpack_block1d16_h6_sse2_rowloop ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


SECTION_RODATA
; Rounding constant added before the shift by VP8_FILTER_SHIFT:
; 0x40 = 64 = 1 << (VP8_FILTER_SHIFT - 1), replicated over 8 words.
align 16
rd:
    times 8 dw 0x40