;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;


%include "aom_ports/x86_abi_support.asm"

;Note: tap3 and tap4 have to be applied and added after other taps to avoid
;overflow. (All partial sums below are accumulated with the saturating
;paddsw, so the order of the additions is significant and must be kept.)

;-----------------------------------------------------------------------
; GET_FILTERS_4
; Reads the eight 16-bit taps addressed by arg(5) and stores them, two
; taps per 128-bit slot, for the 4-pixel-wide kernels:
;     k0k1 = { k0 x4 | k1 x4 }      k2k3 = { k2 x4 | k3 x4 }
;     k5k4 = { k5 x4 | k4 x4 }      k6k7 = { k6 x4 | k7 x4 }
; It also stores krd (the word 0x0040 in every lane) and zero (all bits 0).
; The invoking function must %define k0k1, k2k3, k5k4, k6k7, krd and zero
; as 16-byte-aligned memory operands: they are written with movdqa.
; The tap table itself is read with movdqa and so must be 16-byte aligned.
; Clobbers: rcx, rdx, xmm0-xmm7.
;-----------------------------------------------------------------------
%macro GET_FILTERS_4 0
    mov         rdx, arg(5)                 ;rdx = filter ptr
    movdqa      xmm7, [rdx]                 ;xmm7 = k0..k7 (aligned load)

    ; taps 0..3 live in the low quadword of xmm7
    pshuflw     xmm0, xmm7, 00000000b       ;low qword = k0 x4
    pshuflw     xmm1, xmm7, 01010101b       ;low qword = k1 x4
    punpcklqdq  xmm0, xmm1                  ;{k0 x4 | k1 x4}
    movdqa      k0k1, xmm0

    pshuflw     xmm2, xmm7, 10101010b       ;low qword = k2 x4
    pshuflw     xmm3, xmm7, 11111111b       ;low qword = k3 x4
    punpcklqdq  xmm2, xmm3                  ;{k2 x4 | k3 x4}
    movdqa      k2k3, xmm2

    ; bring taps 4..7 down into the low quadword
    psrldq      xmm7, 8

    pshuflw     xmm4, xmm7, 00000000b       ;low qword = k4 x4
    pshuflw     xmm5, xmm7, 01010101b       ;low qword = k5 x4
    punpcklqdq  xmm5, xmm4                  ;{k5 x4 | k4 x4} - k4 on top
    movdqa      k5k4, xmm5

    pshuflw     xmm6, xmm7, 10101010b       ;low qword = k6 x4
    pshuflw     xmm7, xmm7, 11111111b       ;low qword = k7 x4
    punpcklqdq  xmm6, xmm7                  ;{k6 x4 | k7 x4}
    movdqa      k6k7, xmm6

    ; rounding term: 0x0040 (= 64) replicated into all eight words
    mov         rcx, 0x00400040
    movq        xmm6, rcx
    pshufd      xmm6, xmm6, 0
    movdqa      krd, xmm6

    pxor        xmm7, xmm7
    movdqa      zero, xmm7
%endm

;-----------------------------------------------------------------------
; APPLY_FILTER_4 avg
; Produces 4 output pixels and stores them (4 bytes) at [rdi].
; In:   xmm0..xmm7 = source bytes feeding tap 0..7 (low 4 bytes of each)
;       k0k1, k2k3, k5k4, k6k7, krd, zero as written by GET_FILTERS_4
; %1:   non-zero -> pavgb the result with the 4 bytes already at [rdi]
;       before storing; zero -> plain store.
; Result = packuswb((sum of tap products + krd) >> 7), all adds saturating.
; Clobbers: xmm0, xmm1, xmm2, xmm5, xmm6.
;-----------------------------------------------------------------------
%macro APPLY_FILTER_4 1
    ; Pair the inputs two-per-register, widen to words and multiply.
    punpckldq   xmm0, xmm1                  ;{in0 | in1}
    punpcklbw   xmm0, zero
    pmullw      xmm0, k0k1                  ;{in0*k0 | in1*k1}

    punpckldq   xmm6, xmm7                  ;{in6 | in7}
    punpcklbw   xmm6, zero
    pmullw      xmm6, k6k7                  ;{in6*k6 | in7*k7}

    punpckldq   xmm2, xmm3                  ;{in2 | in3}
    punpcklbw   xmm2, zero
    pmullw      xmm2, k2k3                  ;{in2*k2 | in3*k3}

    punpckldq   xmm5, xmm4                  ;{in5 | in4}
    punpcklbw   xmm5, zero
    pmullw      xmm5, k5k4                  ;{in5*k5 | in4*k4}

    ; Accumulate in the low four words of xmm0. The order below is fixed:
    ; outer taps first, then 2 and 5, and taps 3 and 4 last.
    paddsw      xmm0, xmm6                  ;{t0+t6 | t1+t7}
    movdqa      xmm1, xmm0
    psrldq      xmm1, 8                     ;low = t1+t7
    paddsw      xmm0, xmm1                  ;low = t0+t6+t1+t7
    paddsw      xmm0, xmm2                  ;+ t2
    psrldq      xmm2, 8                     ;low = t3
    paddsw      xmm0, xmm5                  ;+ t5
    psrldq      xmm5, 8                     ;low = t4
    paddsw      xmm0, xmm2                  ;+ t3
    paddsw      xmm0, xmm5                  ;+ t4

    paddsw      xmm0, krd                   ;add rounding term
    psraw       xmm0, 7                     ;arithmetic shift right by 7
    packuswb    xmm0, xmm0                  ;saturate words to unsigned bytes

%if %1
    movd        xmm1, [rdi]                 ;current destination pixels
    pavgb       xmm0, xmm1                  ;rounded byte average
%endif
    movd        [rdi], xmm0                 ;store 4 pixels
%endm

;-----------------------------------------------------------------------
; GET_FILTERS
; Set-up for the 8- and 16-pixel-wide kernels:
;   rsi = arg(0) (src_ptr), rdi = arg(2) (output_ptr)
;   k0..k7 = tap n replicated into all eight words of its slot
;   krd    = the word 0x0040 in every lane, zero = all bits 0
; The invoking function must %define k0..k7, krd and zero as 16-byte-aligned
; memory operands (written with movdqa). The tap table is read with movdqa
; and so must be 16-byte aligned.
; Clobbers: rcx, rdx, rsi, rdi, xmm0-xmm7.
;-----------------------------------------------------------------------
%macro GET_FILTERS 0
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rdx, arg(5)                 ;filter ptr
    movdqa      xmm7, [rdx]                 ;xmm7 = k0..k7 (aligned load)

    ; taps 0..3: replicate inside the low qword, then spread to 8 words
    pshuflw     xmm0, xmm7, 00000000b       ;k0
    punpcklwd   xmm0, xmm0
    movdqa      k0, xmm0

    pshuflw     xmm1, xmm7, 01010101b       ;k1
    punpcklwd   xmm1, xmm1
    movdqa      k1, xmm1

    pshuflw     xmm2, xmm7, 10101010b       ;k2
    punpcklwd   xmm2, xmm2
    movdqa      k2, xmm2

    pshuflw     xmm3, xmm7, 11111111b       ;k3
    punpcklwd   xmm3, xmm3
    movdqa      k3, xmm3

    ; taps 4..7: same idea on the high qword
    pshufhw     xmm4, xmm7, 00000000b       ;k4
    punpckhwd   xmm4, xmm4
    movdqa      k4, xmm4

    pshufhw     xmm5, xmm7, 01010101b       ;k5
    punpckhwd   xmm5, xmm5
    movdqa      k5, xmm5

    pshufhw     xmm6, xmm7, 10101010b       ;k6
    punpckhwd   xmm6, xmm6
    movdqa      k6, xmm6

    pshufhw     xmm7, xmm7, 11111111b       ;k7 (last: overwrites the source)
    punpckhwd   xmm7, xmm7
    movdqa      k7, xmm7

    ; rounding term: 0x0040 (= 64) replicated into all eight words
    mov         rcx, 0x00400040
    movq        xmm6, rcx
    pshufd      xmm6, xmm6, 0
    movdqa      krd, xmm6

    pxor        xmm7, xmm7
    movdqa      zero, xmm7
%endm

;-----------------------------------------------------------------------
; LOAD_VERT_8 offset
; Loads 8 bytes from each of eight consecutive rows into xmm0..xmm7
; (xmmN = row N) and advances rsi by one row.
; In:   rsi = address of row 0, rax = row pitch, rdx = 3 * rax
; %1:   byte offset added to every row address
; Out:  rsi = rsi + rax
;-----------------------------------------------------------------------
%macro LOAD_VERT_8 1
    movq        xmm0, [rsi + %1]            ;row 0
    movq        xmm1, [rsi + rax + %1]      ;row 1
    movq        xmm6, [rsi + rdx * 2 + %1]  ;row 6
    lea         rsi,  [rsi + rax]           ;rsi now addresses row 1
    movq        xmm2, [rsi + rax + %1]      ;row 2
    movq        xmm3, [rsi + rax * 2 + %1]  ;row 3
    movq        xmm4, [rsi + rdx + %1]      ;row 4
    movq        xmm5, [rsi + rax * 4 + %1]  ;row 5
    movq        xmm7, [rsi + rdx * 2 + %1]  ;row 7
%endm

;-----------------------------------------------------------------------
; APPLY_FILTER_8 avg, offset
; Produces 8 output pixels and stores them (8 bytes) at [rdi + offset].
; In:   xmm0..xmm7 = source bytes feeding tap 0..7 (low 8 bytes of each)
;       k0..k7, krd, zero as written by GET_FILTERS
; %1:   non-zero -> pavgb the result with the 8 bytes already at the
;       destination before storing; zero -> plain store.
; %2:   byte offset from rdi of the destination.
; Result = packuswb((sum of tap products + krd) >> 7), all adds saturating.
; Clobbers: xmm0-xmm7.
;-----------------------------------------------------------------------
%macro APPLY_FILTER_8 2
    ; Widen each input to words and scale it by its tap.
    punpcklbw   xmm0, zero
    pmullw      xmm0, k0
    punpcklbw   xmm1, zero
    pmullw      xmm1, k1
    punpcklbw   xmm6, zero
    pmullw      xmm6, k6
    punpcklbw   xmm7, zero
    pmullw      xmm7, k7
    punpcklbw   xmm2, zero
    pmullw      xmm2, k2
    punpcklbw   xmm5, zero
    pmullw      xmm5, k5
    punpcklbw   xmm3, zero
    pmullw      xmm3, k3
    punpcklbw   xmm4, zero
    pmullw      xmm4, k4

    ; Saturating accumulation; taps 3 and 4 are deliberately added last.
    paddsw      xmm0, xmm1
    paddsw      xmm0, xmm6
    paddsw      xmm0, xmm7
    paddsw      xmm0, xmm2
    paddsw      xmm0, xmm5
    paddsw      xmm0, xmm3
    paddsw      xmm0, xmm4

    paddsw      xmm0, krd                   ;add rounding term
    psraw       xmm0, 7                     ;arithmetic shift right by 7
    packuswb    xmm0, xmm0                  ;saturate words to unsigned bytes
%if %1
    movq        xmm1, [rdi + %2]            ;current destination pixels
    pavgb       xmm0, xmm1                  ;rounded byte average
%endif
    movq        [rdi + %2], xmm0            ;store 8 pixels
%endm

SECTION .text

;void aom_filter_block1d4_v8_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short *filter
;)
; Vertical 8-tap filter, 4 pixels per output row.
; Output row r is computed from source rows r .. r+7 (tap n reads row r+n),
; so src_ptr addresses the row that feeds tap 0.
; The loop is dec/jnz, i.e. the body executes before the count is tested.
; Loop registers:
;   rsi = source row for tap 0     rax = src_pitch   rdx = 3 * src_pitch
;   rdi = destination row          rbx = out_pitch   rcx = rows remaining
; arg(), SHADOW_ARGS_TO_STACK, SAVE_XMM, ALIGN_STACK, RESTORE_XMM and
; UNSHADOW_ARGS are provided by aom_ports/x86_abi_support.asm.
global sym(aom_filter_block1d4_v8_sse2) PRIVATE
sym(aom_filter_block1d4_v8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 6                 ;six 16-byte scratch slots
    %define k0k1 [rsp + 16 * 0]
    %define k2k3 [rsp + 16 * 1]
    %define k5k4 [rsp + 16 * 2]
    %define k6k7 [rsp + 16 * 3]
    %define krd  [rsp + 16 * 4]
    %define zero [rsp + 16 * 5]

    GET_FILTERS_4

    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    lea         rdx, [rax + rax * 2]        ;3 * pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr

.loop:
    movd        xmm0, [rsi]                 ;row 0
    movd        xmm1, [rsi + rax]           ;row 1
    movd        xmm6, [rsi + rdx * 2]       ;row 6
    lea         rsi,  [rsi + rax]           ;step down one source row
    movd        xmm2, [rsi + rax]           ;row 2
    movd        xmm3, [rsi + rax * 2]       ;row 3
    movd        xmm4, [rsi + rdx]           ;row 4
    movd        xmm5, [rsi + rax * 4]       ;row 5
    movd        xmm7, [rsi + rdx * 2]       ;row 7

    APPLY_FILTER_4 0

    lea         rdi, [rdi + rbx]            ;next destination row
    dec         rcx
    jnz         .loop

    add         rsp, 16 * 6                 ;drop the scratch slots
    pop         rsp                         ;presumably the rsp saved by
                                            ;ALIGN_STACK - see the include
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void aom_filter_block1d8_v8_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short *filter
;)
; Vertical 8-tap filter, 8 pixels per output row.
; Output row r is computed from source rows r .. r+7 (tap n reads row r+n),
; so src_ptr addresses the row that feeds tap 0.
; The loop is dec/jnz, i.e. the body executes before the count is tested.
; Loop registers (rsi/rdi are loaded by GET_FILTERS):
;   rsi = source row for tap 0     rax = src_pitch   rdx = 3 * src_pitch
;   rdi = destination row          rbx = out_pitch   rcx = rows remaining
; arg(), SHADOW_ARGS_TO_STACK, SAVE_XMM, ALIGN_STACK, RESTORE_XMM and
; UNSHADOW_ARGS are provided by aom_ports/x86_abi_support.asm.
global sym(aom_filter_block1d8_v8_sse2) PRIVATE
sym(aom_filter_block1d8_v8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10                ;ten 16-byte scratch slots
    %define k0   [rsp + 16 * 0]
    %define k1   [rsp + 16 * 1]
    %define k2   [rsp + 16 * 2]
    %define k3   [rsp + 16 * 3]
    %define k4   [rsp + 16 * 4]
    %define k5   [rsp + 16 * 5]
    %define k6   [rsp + 16 * 6]
    %define k7   [rsp + 16 * 7]
    %define krd  [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS                             ;also loads rsi / rdi

    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    lea         rdx, [rax + rax * 2]        ;3 * pixels_per_line

.loop:
    LOAD_VERT_8 0                           ;rows 0..7; rsi += rax
    APPLY_FILTER_8 0, 0

    lea         rdi, [rdi + rbx]            ;next destination row
    dec         rcx
    jnz         .loop

    add         rsp, 16 * 10                ;drop the scratch slots
    pop         rsp                         ;presumably the rsp saved by
                                            ;ALIGN_STACK - see the include
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void aom_filter_block1d16_v8_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short *filter
;)
; Vertical 8-tap filter, 16 pixels per output row, processed as two
; independent 8-pixel halves (byte offsets 0 and 8).
; Output row r is computed from source rows r .. r+7 (tap n reads row r+n).
; The loop is dec/jnz, i.e. the body executes before the count is tested.
; Loop registers (rsi/rdi are loaded by GET_FILTERS):
;   rsi = source row for tap 0     rax = src_pitch   rdx = 3 * src_pitch
;   rdi = destination row          rbx = out_pitch   rcx = rows remaining
global sym(aom_filter_block1d16_v8_sse2) PRIVATE
sym(aom_filter_block1d16_v8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10                ;ten 16-byte scratch slots
    %define k0   [rsp + 16 * 0]
    %define k1   [rsp + 16 * 1]
    %define k2   [rsp + 16 * 2]
    %define k3   [rsp + 16 * 3]
    %define k4   [rsp + 16 * 4]
    %define k5   [rsp + 16 * 5]
    %define k6   [rsp + 16 * 6]
    %define k7   [rsp + 16 * 7]
    %define krd  [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS                             ;also loads rsi / rdi

    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    lea         rdx, [rax + rax * 2]        ;3 * pixels_per_line

.loop:
    ; left half: columns 0..7
    LOAD_VERT_8 0                           ;rsi += rax as a side effect
    APPLY_FILTER_8 0, 0
    sub         rsi, rax                    ;undo that step for the 2nd half

    ; right half: columns 8..15
    LOAD_VERT_8 8                           ;leaves rsi on the next row
    APPLY_FILTER_8 0, 8
    add         rdi, rbx                    ;next destination row

    dec         rcx
    jnz         .loop

    add         rsp, 16 * 10                ;drop the scratch slots
    pop         rsp                         ;presumably the rsp saved by
                                            ;ALIGN_STACK - see the include
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void aom_filter_block1d4_h8_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pixels_per_line,
;    unsigned char *output_ptr,
;    unsigned int   output_pitch,
;    unsigned int   output_height,
;    short *filter
;)
; Horizontal 8-tap filter, 4 pixels per output row.
; Each row issues one unaligned 16-byte load starting at src - 3; tap n
; reads the byte at src - 3 + n + x for output column x.
; The loop is dec/jnz, i.e. the body executes before the count is tested.
; Loop registers:
;   rsi = source row      rax = src_pixels_per_line
;   rdi = destination row rdx = output_pitch      rcx = rows remaining
global sym(aom_filter_block1d4_h8_sse2) PRIVATE
sym(aom_filter_block1d4_h8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 6                 ;six 16-byte scratch slots
    %define k0k1 [rsp + 16 * 0]
    %define k2k3 [rsp + 16 * 1]
    %define k5k4 [rsp + 16 * 2]
    %define k6k7 [rsp + 16 * 3]
    %define krd  [rsp + 16 * 4]
    %define zero [rsp + 16 * 5]

    GET_FILTERS_4                           ;uses rcx / rdx as scratch

    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr

.loop:
    movdqu      xmm0, [rsi - 3]             ;bytes src-3 .. src+12

    ; xmmN = the same bytes shifted down by N, i.e. the input of tap N
    movdqa      xmm1, xmm0
    psrldq      xmm1, 1
    movdqa      xmm2, xmm0
    psrldq      xmm2, 2
    movdqa      xmm3, xmm0
    psrldq      xmm3, 3
    movdqa      xmm4, xmm0
    psrldq      xmm4, 4
    movdqa      xmm5, xmm0
    psrldq      xmm5, 5
    movdqa      xmm6, xmm0
    psrldq      xmm6, 6
    movdqa      xmm7, xmm0
    psrldq      xmm7, 7

    APPLY_FILTER_4 0

    lea         rsi, [rsi + rax]            ;next source row
    lea         rdi, [rdi + rdx]            ;next destination row
    dec         rcx
    jnz         .loop

    add         rsp, 16 * 6                 ;drop the scratch slots
    pop         rsp                         ;presumably the rsp saved by
                                            ;ALIGN_STACK - see the include

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void aom_filter_block1d8_h8_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pixels_per_line,
;    unsigned char *output_ptr,
;    unsigned int
;                   output_pitch,
;    unsigned int   output_height,
;    short *filter
;)
; Horizontal 8-tap filter, 8 pixels per output row.
; Each row issues one unaligned 16-byte load starting at src - 3; tap n
; reads the byte at src - 3 + n + x for output column x.
; The loop is dec/jnz, i.e. the body executes before the count is tested.
; Loop registers (rsi/rdi are loaded by GET_FILTERS):
;   rsi = source row      rax = src_pixels_per_line
;   rdi = destination row rdx = output_pitch      rcx = rows remaining
; arg(), SHADOW_ARGS_TO_STACK, SAVE_XMM, ALIGN_STACK, RESTORE_XMM and
; UNSHADOW_ARGS are provided by aom_ports/x86_abi_support.asm.
global sym(aom_filter_block1d8_h8_sse2) PRIVATE
sym(aom_filter_block1d8_h8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10                ;ten 16-byte scratch slots
    %define k0   [rsp + 16 * 0]
    %define k1   [rsp + 16 * 1]
    %define k2   [rsp + 16 * 2]
    %define k3   [rsp + 16 * 3]
    %define k4   [rsp + 16 * 4]
    %define k5   [rsp + 16 * 5]
    %define k6   [rsp + 16 * 6]
    %define k7   [rsp + 16 * 7]
    %define krd  [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS                             ;also loads rsi / rdi

    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line

.loop:
    movdqu      xmm0, [rsi - 3]             ;bytes src-3 .. src+12

    ; xmmN = the same bytes shifted down by N, i.e. the input of tap N
    movdqa      xmm1, xmm0
    psrldq      xmm1, 1
    movdqa      xmm2, xmm0
    psrldq      xmm2, 2
    movdqa      xmm3, xmm0
    psrldq      xmm3, 3
    movdqa      xmm4, xmm0
    psrldq      xmm4, 4
    movdqa      xmm5, xmm0
    psrldq      xmm5, 5
    movdqa      xmm6, xmm0
    psrldq      xmm6, 6
    movdqa      xmm7, xmm0
    psrldq      xmm7, 7

    APPLY_FILTER_8 0, 0

    lea         rsi, [rsi + rax]            ;next source row
    lea         rdi, [rdi + rdx]            ;next destination row
    dec         rcx
    jnz         .loop

    add         rsp, 16 * 10                ;drop the scratch slots
    pop         rsp                         ;presumably the rsp saved by
                                            ;ALIGN_STACK - see the include

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void aom_filter_block1d16_h8_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pixels_per_line,
;    unsigned char *output_ptr,
;    unsigned int   output_pitch,
;    unsigned int   output_height,
;    short *filter
;)
; Horizontal 8-tap filter, 16 pixels per output row, processed as two
; independent 8-pixel halves. Each half issues one unaligned 16-byte load:
; at src - 3 for columns 0..7 and at src + 5 (= src + 8 - 3) for 8..15.
; The loop is dec/jnz, i.e. the body executes before the count is tested.
; Loop registers (rsi/rdi are loaded by GET_FILTERS):
;   rsi = source row      rax = src_pixels_per_line
;   rdi = destination row rdx = output_pitch      rcx = rows remaining
global sym(aom_filter_block1d16_h8_sse2) PRIVATE
sym(aom_filter_block1d16_h8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10                ;ten 16-byte scratch slots
    %define k0   [rsp + 16 * 0]
    %define k1   [rsp + 16 * 1]
    %define k2   [rsp + 16 * 2]
    %define k3   [rsp + 16 * 3]
    %define k4   [rsp + 16 * 4]
    %define k5   [rsp + 16 * 5]
    %define k6   [rsp + 16 * 6]
    %define k7   [rsp + 16 * 7]
    %define krd  [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS                             ;also loads rsi / rdi

    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line

.loop:
    ; left half: columns 0..7
    movdqu      xmm0, [rsi - 3]             ;bytes src-3 .. src+12

    movdqa      xmm1, xmm0
    psrldq      xmm1, 1
    movdqa      xmm2, xmm0
    psrldq      xmm2, 2
    movdqa      xmm3, xmm0
    psrldq      xmm3, 3
    movdqa      xmm4, xmm0
    psrldq      xmm4, 4
    movdqa      xmm5, xmm0
    psrldq      xmm5, 5
    movdqa      xmm6, xmm0
    psrldq      xmm6, 6
    movdqa      xmm7, xmm0
    psrldq      xmm7, 7

    APPLY_FILTER_8 0, 0

    ; right half: columns 8..15
    movdqu      xmm0, [rsi + 5]             ;bytes src+5 .. src+20

    movdqa      xmm1, xmm0
    psrldq      xmm1, 1
    movdqa      xmm2, xmm0
    psrldq      xmm2, 2
    movdqa      xmm3, xmm0
    psrldq      xmm3, 3
    movdqa      xmm4, xmm0
    psrldq      xmm4, 4
    movdqa      xmm5, xmm0
    psrldq      xmm5, 5
    movdqa      xmm6, xmm0
    psrldq      xmm6, 6
    movdqa      xmm7, xmm0
    psrldq      xmm7, 7

    APPLY_FILTER_8 0, 8

    lea         rsi, [rsi + rax]            ;next source row
    lea         rdi, [rdi + rdx]            ;next destination row
    dec         rcx
    jnz         .loop

    add         rsp, 16 * 10                ;drop the scratch slots
    pop         rsp                         ;presumably the rsp saved by
                                            ;ALIGN_STACK - see the include

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret