;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

%define _t0 0
%define _t1 _t0 + 16
%define _p3 _t1 + 16
%define _p2 _p3 + 16
%define _p1 _p2 + 16
%define _p0 _p1 + 16
%define _q0 _p0 + 16
%define _q1 _q0 + 16
%define _q2 _q1 + 16
%define _q3 _q2 + 16
%define lf_var_size 160

; Use of pmaxub instead of psubusb to compute filter mask was seen
; in ffvp8

%macro LFH_FILTER_AND_HEV_MASK 1
%if %1
        movdqa      xmm2, [rdi+2*rax]       ; q3
        movdqa      xmm1, [rsi+2*rax]       ; q2
        movdqa      xmm4, [rsi+rax]         ; q1
        movdqa      xmm5, [rsi]             ; q0
        neg         rax                     ; negate pitch to deal with above border
%else
        movlps      xmm2, [rsi + rcx*2]     ; q3
        movlps      xmm1, [rsi + rcx]       ; q2
        movlps      xmm4, [rsi]             ; q1
        movlps      xmm5, [rsi + rax]       ; q0

        movhps      xmm2, [rdi + rcx*2]
        movhps      xmm1, [rdi + rcx]
        movhps      xmm4, [rdi]
        movhps      xmm5, [rdi + rax]

        lea         rsi, [rsi + rax*4]
        lea         rdi, [rdi + rax*4]

        movdqa      [rsp+_q2], xmm1         ; store q2
        movdqa      [rsp+_q1], xmm4         ; store q1
%endif
        movdqa      xmm7, [rdx]             ; limit

        movdqa      xmm6, xmm1              ; q2
        movdqa      xmm3, xmm4              ; q1

        psubusb     xmm1, xmm2              ; q2-=q3
        psubusb     xmm2, xmm6              ; q3-=q2

        psubusb     xmm4, xmm6              ; q1-=q2
        psubusb     xmm6, xmm3              ; q2-=q1

        por         xmm4, xmm6              ; abs(q2-q1)
        por         xmm1, xmm2              ; abs(q3-q2)

        movdqa      xmm0, xmm5              ; q0
        pmaxub      xmm1, xmm4

        psubusb     xmm5, xmm3              ; q0-=q1
        psubusb     xmm3, xmm0              ; q1-=q0

        por         xmm5, xmm3              ; abs(q0-q1)
        movdqa      [rsp+_t0], xmm5         ; save to t0

        pmaxub      xmm1, xmm5

%if %1
        movdqa      xmm2, [rsi+4*rax]       ; p3
        movdqa      xmm4, [rdi+4*rax]       ; p2
        movdqa      xmm6, [rsi+2*rax]       ; p1
%else
        movlps      xmm2, [rsi + rax]       ; p3
        movlps      xmm4, [rsi]             ; p2
        movlps      xmm6, [rsi + rcx]       ; p1

        movhps      xmm2, [rdi + rax]
        movhps      xmm4, [rdi]
        movhps      xmm6, [rdi + rcx]

        movdqa      [rsp+_p2], xmm4         ; store p2
        movdqa      [rsp+_p1], xmm6         ; store p1
%endif

        movdqa      xmm5, xmm4              ; p2
        movdqa      xmm3, xmm6              ; p1

        psubusb     xmm4, xmm2              ; p2-=p3
        psubusb     xmm2, xmm5              ; p3-=p2

        psubusb     xmm3, xmm5              ; p1-=p2
        pmaxub      xmm1, xmm4              ; abs(p3 - p2)

        psubusb     xmm5, xmm6              ; p2-=p1
        pmaxub      xmm1, xmm2              ; abs(p3 - p2)

        pmaxub      xmm1, xmm5              ; abs(p2 - p1)
        movdqa      xmm2, xmm6              ; p1

        pmaxub      xmm1, xmm3              ; abs(p2 - p1)
%if %1
        movdqa      xmm4, [rsi+rax]         ; p0
        movdqa      xmm3, [rdi]             ; q1
%else
        movlps      xmm4, [rsi + rcx*2]     ; p0
        movhps      xmm4, [rdi + rcx*2]
        movdqa      xmm3, [rsp+_q1]         ; q1
%endif

        movdqa      xmm5, xmm4              ; p0
        psubusb     xmm4, xmm6              ; p0-=p1

        psubusb     xmm6, xmm5              ; p1-=p0

        por         xmm6, xmm4              ; abs(p1 - p0)
        mov         rdx, arg(2)             ; get blimit

        movdqa      [rsp+_t1], xmm6         ; save to t1

        movdqa      xmm4, xmm3              ; q1
        pmaxub      xmm1, xmm6

        psubusb     xmm3, xmm2              ; q1-=p1
        psubusb     xmm2, xmm4              ; p1-=q1

        psubusb     xmm1, xmm7
        por         xmm2, xmm3              ; abs(p1-q1)

        movdqa      xmm7, [rdx]             ; blimit
        mov         rdx, arg(4)             ; hev get thresh

        movdqa      xmm3, xmm0              ; q0
        pand        xmm2, [GLOBAL(tfe)]     ; set lsb of each byte to zero

        movdqa      xmm6, xmm5              ; p0
        psrlw       xmm2, 1                 ; abs(p1-q1)/2

        psubusb     xmm5, xmm3              ; p0-=q0
        psubusb     xmm3, xmm6              ; q0-=p0
        por         xmm5, xmm3              ; abs(p0 - q0)
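
        ; Note: at this point xmm1 holds the running maximum of the neighbour
        ; differences minus limit (nonzero bytes fail the limit test) and
        ; xmm2 holds abs(p1-q1)/2.  The lines below finish the blimit
        ; breakout test, abs(p0-q0)*2 + abs(p1-q1)/2 > blimit, and derive the
        ; hev mask from the saved abs(q1-q0) and abs(p1-p0).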

        paddusb     xmm5, xmm5              ; abs(p0-q0)*2

        movdqa      xmm4, [rsp+_t0]         ; hev get abs (q1 - q0)
        movdqa      xmm3, [rsp+_t1]         ; get abs (p1 - p0)

        paddusb     xmm5, xmm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2

        movdqa      xmm2, [rdx]             ; hev

        psubusb     xmm5, xmm7              ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
        psubusb     xmm4, xmm2              ; hev

        psubusb     xmm3, xmm2              ; hev
        por         xmm1, xmm5

        pxor        xmm7, xmm7
        paddb       xmm4, xmm3              ; hev abs(q1 - q0) > thresh || abs(p1 - p0) > thresh

        pcmpeqb     xmm4, xmm5              ; hev
        pcmpeqb     xmm3, xmm3              ; hev

        pcmpeqb     xmm1, xmm7              ; mask xmm1
        pxor        xmm4, xmm3              ; hev
%endmacro

%macro B_FILTER 1
        movdqa      xmm3, [GLOBAL(t80)]
%if %1 == 0
        movdqa      xmm2, [rsp+_p1]         ; p1
        movdqa      xmm7, [rsp+_q1]         ; q1
%elif %1 == 1
        movdqa      xmm2, [rsi+2*rax]       ; p1
        movdqa      xmm7, [rdi]             ; q1
%elif %1 == 2
        movdqa      xmm2, [rsp+_p1]         ; p1
        movdqa      xmm6, [rsp+_p0]         ; p0
        movdqa      xmm0, [rsp+_q0]         ; q0
        movdqa      xmm7, [rsp+_q1]         ; q1
%endif

        pxor        xmm2, xmm3              ; p1 offset to convert to signed values
        pxor        xmm7, xmm3              ; q1 offset to convert to signed values

        psubsb      xmm2, xmm7              ; p1 - q1
        pxor        xmm6, xmm3              ; offset to convert to signed values

        pand        xmm2, xmm4              ; high var mask (hvm)(p1 - q1)
        pxor        xmm0, xmm3              ; offset to convert to signed values

        movdqa      xmm3, xmm0              ; q0
        psubsb      xmm0, xmm6              ; q0 - p0
        paddsb      xmm2, xmm0              ; 1 * (q0 - p0) + hvm(p1 - q1)
        paddsb      xmm2, xmm0              ; 2 * (q0 - p0) + hvm(p1 - q1)
        paddsb      xmm2, xmm0              ; 3 * (q0 - p0) + hvm(p1 - q1)
        pand        xmm1, xmm2              ; mask filter values we don't care about

        movdqa      xmm2, xmm1
        paddsb      xmm1, [GLOBAL(t4)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 4
        paddsb      xmm2, [GLOBAL(t3)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 3

        punpckhbw   xmm5, xmm2              ; axbxcxdx
        punpcklbw   xmm2, xmm2              ; exfxgxhx

        punpcklbw   xmm0, xmm1              ; exfxgxhx
        psraw       xmm5, 11                ; sign extended shift right by 3

        punpckhbw   xmm1, xmm1              ; axbxcxdx
        psraw       xmm2, 11                ; sign extended shift right by 3

        packsswb    xmm2, xmm5              ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3
        psraw       xmm0, 11                ; sign extended shift right by 3

        psraw       xmm1, 11                ; sign extended shift right by 3
        movdqa      xmm5, xmm0              ; save results

        packsswb    xmm0, xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >> 3

        paddsb      xmm6, xmm2              ; p0 += p0 add

        movdqa      xmm2, [GLOBAL(ones)]
        paddsw      xmm5, xmm2
        paddsw      xmm1, xmm2
        psraw       xmm5, 1                 ; partial shifted one more time for 2nd tap
        psraw       xmm1, 1                 ; partial shifted one more time for 2nd tap
        packsswb    xmm5, xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >> 4
        movdqa      xmm2, [GLOBAL(t80)]

%if %1 == 0
        movdqa      xmm1, [rsp+_p1]         ; p1
        lea         rsi, [rsi + rcx*2]
        lea         rdi, [rdi + rcx*2]
%elif %1 == 1
        movdqa      xmm1, [rsi+2*rax]       ; p1
%elif %1 == 2
        movdqa      xmm1, [rsp+_p1]         ; p1
%endif

        pandn       xmm4, xmm5              ; high edge variance additive
        pxor        xmm6, xmm2              ; unoffset

        pxor        xmm1, xmm2              ; reoffset
        psubsb      xmm3, xmm0              ; q0 -= q0 add

        paddsb      xmm1, xmm4              ; p1 += p1 add
        pxor        xmm3, xmm2              ; unoffset

        pxor        xmm1, xmm2              ; unoffset
        psubsb      xmm7, xmm4              ; q1 -= q1 add

        pxor        xmm7, xmm2              ; unoffset
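        ; write back the four filtered rows (p1, p0, q0, q1); the uv variant
        ; (%1 == 0) splits each register back into its u (low) and v (high)
        ; halves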
%if %1 == 0
        movq        [rsi], xmm6             ; p0
        movhps      [rdi], xmm6
        movq        [rsi + rax], xmm1       ; p1
        movhps      [rdi + rax], xmm1
        movq        [rsi + rcx], xmm3       ; q0
        movhps      [rdi + rcx], xmm3
        movq        [rsi + rcx*2], xmm7     ; q1
        movhps      [rdi + rcx*2], xmm7
%elif %1 == 1
        movdqa      [rsi+rax], xmm6         ; write back
        movdqa      [rsi+2*rax], xmm1       ; write back
        movdqa      [rsi], xmm3             ; write back
        movdqa      [rdi], xmm7             ; write back
%endif

%endmacro

SECTION .text

%if ABI_IS_32BIT

;void vp8_loop_filter_horizontal_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;)
global sym(vp8_loop_filter_horizontal_edge_sse2) PRIVATE
sym(vp8_loop_filter_horizontal_edge_sse2):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 5
        SAVE_XMM 7
        GET_GOT     rbx
        push        rsi
        push        rdi
        ; end prolog

        ALIGN_STACK 16, rax
        sub         rsp, lf_var_size

        mov         rsi, arg(0)             ; src_ptr
        movsxd      rax, dword ptr arg(1)   ; src_pixel_step

        mov         rdx, arg(3)             ; limit

        lea         rdi, [rsi+rax]          ; rdi points to row +1 for indirect addressing

        ; calculate breakout conditions and high edge variance
        LFH_FILTER_AND_HEV_MASK 1
        ; filter and write back the result
        B_FILTER 1

        add         rsp, lf_var_size
        pop         rsp
        ; begin epilog
        pop         rdi
        pop         rsi
        RESTORE_GOT
        RESTORE_XMM
        UNSHADOW_ARGS
        pop         rbp
        ret

%endif

;void vp8_loop_filter_horizontal_edge_uv_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
global sym(vp8_loop_filter_horizontal_edge_uv_sse2) PRIVATE
sym(vp8_loop_filter_horizontal_edge_uv_sse2):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 6
        SAVE_XMM 7
        GET_GOT     rbx
        push        rsi
        push        rdi
        ; end prolog

        ALIGN_STACK 16, rax
        sub         rsp, lf_var_size

        mov         rsi, arg(0)             ; u
        mov         rdi, arg(5)             ; v
        movsxd      rax, dword ptr arg(1)   ; src_pixel_step
        mov         rcx, rax
        neg         rax                     ; negate pitch to deal with above border

        mov         rdx, arg(3)             ; limit

        lea         rsi, [rsi + rcx]
        lea         rdi, [rdi + rcx]

        ; calculate breakout conditions and high edge variance
        LFH_FILTER_AND_HEV_MASK 0
        ; filter and write back the result
        B_FILTER 0

        add         rsp, lf_var_size
        pop         rsp
        ; begin epilog
        pop         rdi
        pop         rsi
        RESTORE_GOT
        RESTORE_XMM
        UNSHADOW_ARGS
        pop         rbp
        ret


%macro MB_FILTER_AND_WRITEBACK 1
        movdqa      xmm3, [GLOBAL(t80)]
%if %1 == 0
        movdqa      xmm2, [rsp+_p1]         ; p1
        movdqa      xmm7, [rsp+_q1]         ; q1
%elif %1 == 1
        movdqa      xmm2, [rsi+2*rax]       ; p1
        movdqa      xmm7, [rdi]             ; q1

        mov         rcx, rax
        neg         rcx
%elif %1 == 2
        movdqa      xmm2, [rsp+_p1]         ; p1
        movdqa      xmm6, [rsp+_p0]         ; p0
        movdqa      xmm0, [rsp+_q0]         ; q0
        movdqa      xmm7, [rsp+_q1]         ; q1
%endif

        pxor        xmm2, xmm3              ; p1 offset to convert to signed values
        pxor        xmm7, xmm3              ; q1 offset to convert to signed values
        pxor        xmm6, xmm3              ; offset to convert to signed values
        pxor        xmm0, xmm3              ; offset to convert to signed values

        psubsb      xmm2, xmm7              ; p1 - q1

        movdqa      xmm3, xmm0              ; q0
        psubsb      xmm0, xmm6              ; q0 - p0
        paddsb      xmm2, xmm0              ; 1 * (q0 - p0) + (p1 - q1)
        paddsb      xmm2, xmm0              ; 2 * (q0 - p0) + (p1 - q1)
        paddsb      xmm2, xmm0              ; 3 * (q0 - p0) + (p1 - q1)
        pand        xmm1, xmm2              ; mask filter values we don't care about

        movdqa      xmm2, xmm1              ; vp8_filter

        pand        xmm2, xmm4              ; Filter2 = vp8_filter & hev
        pxor        xmm0, xmm0

        pandn       xmm4, xmm1              ; vp8_filter &= ~hev
        pxor        xmm1, xmm1

        punpcklbw   xmm0, xmm4              ; Filter 2 (hi)
        punpckhbw   xmm1, xmm4              ; Filter 2 (lo)

        movdqa      xmm5, xmm2
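
        ; the ~hev filter bytes were interleaved into the high byte of each
        ; word above, so pmulhw against s9 (9 << 8) leaves 9 * Filter in each
        ; word: ((f << 8) * (9 << 8)) >> 16 == 9 * f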
        movdqa      xmm4, [GLOBAL(s9)]
        paddsb      xmm5, [GLOBAL(t3)]      ; vp8_signed_char_clamp(Filter2 + 3)
        paddsb      xmm2, [GLOBAL(t4)]      ; vp8_signed_char_clamp(Filter2 + 4)

        pmulhw      xmm1, xmm4              ; Filter 2 (lo) * 9
        pmulhw      xmm0, xmm4              ; Filter 2 (hi) * 9

        punpckhbw   xmm7, xmm5              ; axbxcxdx
        punpcklbw   xmm5, xmm5              ; exfxgxhx

        psraw       xmm7, 11                ; sign extended shift right by 3

        psraw       xmm5, 11                ; sign extended shift right by 3
        punpckhbw   xmm4, xmm2              ; axbxcxdx

        punpcklbw   xmm2, xmm2              ; exfxgxhx
        psraw       xmm4, 11                ; sign extended shift right by 3

        packsswb    xmm5, xmm7              ; Filter2 >>= 3
        psraw       xmm2, 11                ; sign extended shift right by 3

        packsswb    xmm2, xmm4              ; Filter1 >>= 3

        paddsb      xmm6, xmm5              ; ps0 = ps0 + Filter2

        psubsb      xmm3, xmm2              ; qs0 = qs0 - Filter1
        movdqa      xmm7, xmm1

        movdqa      xmm4, [GLOBAL(s63)]
        movdqa      xmm5, xmm0
        movdqa      xmm2, xmm5
        paddw       xmm0, xmm4              ; Filter 2 (hi) * 9 + 63
        paddw       xmm1, xmm4              ; Filter 2 (lo) * 9 + 63
        movdqa      xmm4, xmm7

        paddw       xmm5, xmm5              ; Filter 2 (hi) * 18

        paddw       xmm7, xmm7              ; Filter 2 (lo) * 18
        paddw       xmm5, xmm0              ; Filter 2 (hi) * 27 + 63

        paddw       xmm7, xmm1              ; Filter 2 (lo) * 27 + 63
        paddw       xmm2, xmm0              ; Filter 2 (hi) * 18 + 63
        psraw       xmm0, 7                 ; (Filter 2 (hi) * 9 + 63) >> 7

        paddw       xmm4, xmm1              ; Filter 2 (lo) * 18 + 63
        psraw       xmm1, 7                 ; (Filter 2 (lo) * 9 + 63) >> 7
        psraw       xmm2, 7                 ; (Filter 2 (hi) * 18 + 63) >> 7

        packsswb    xmm0, xmm1              ; u1 = vp8_signed_char_clamp((63 + Filter2 * 9) >> 7)

        psraw       xmm4, 7                 ; (Filter 2 (lo) * 18 + 63) >> 7
        psraw       xmm5, 7                 ; (Filter 2 (hi) * 27 + 63) >> 7
        psraw       xmm7, 7                 ; (Filter 2 (lo) * 27 + 63) >> 7

        packsswb    xmm5, xmm7              ; u3 = vp8_signed_char_clamp((63 + Filter2 * 27) >> 7)
        packsswb    xmm2, xmm4              ; u2 = vp8_signed_char_clamp((63 + Filter2 * 18) >> 7)
        movdqa      xmm7, [GLOBAL(t80)]

%if %1 == 0
        movdqa      xmm1, [rsp+_q1]         ; q1
        movdqa      xmm4, [rsp+_p1]         ; p1
        lea         rsi, [rsi+rcx*2]
        lea         rdi, [rdi+rcx*2]

%elif %1 == 1
        movdqa      xmm1, [rdi]             ; q1
        movdqa      xmm4, [rsi+rax*2]       ; p1
%elif %1 == 2
        movdqa      xmm4, [rsp+_p1]         ; p1
        movdqa      xmm1, [rsp+_q1]         ; q1
%endif

        pxor        xmm1, xmm7
        pxor        xmm4, xmm7

        psubsb      xmm3, xmm5              ; sq = vp8_signed_char_clamp(qs0 - u3)
        paddsb      xmm6, xmm5              ; sp = vp8_signed_char_clamp(ps0 + u3)
        psubsb      xmm1, xmm2              ; sq = vp8_signed_char_clamp(qs1 - u2)
        paddsb      xmm4, xmm2              ; sp = vp8_signed_char_clamp(ps1 + u2)

%if %1 == 1
        movdqa      xmm2, [rdi+rax*4]       ; p2
        movdqa      xmm5, [rdi+rcx]         ; q2
%else
        movdqa      xmm2, [rsp+_p2]         ; p2
        movdqa      xmm5, [rsp+_q2]         ; q2
%endif

        pxor        xmm1, xmm7              ; *oq1 = sq^0x80;
        pxor        xmm4, xmm7              ; *op1 = sp^0x80;
        pxor        xmm2, xmm7
        pxor        xmm5, xmm7
        paddsb      xmm2, xmm0              ; sp = vp8_signed_char_clamp(ps2 + u1)
        psubsb      xmm5, xmm0              ; sq = vp8_signed_char_clamp(qs2 - u1)
        pxor        xmm2, xmm7              ; *op2 = sp^0x80;
        pxor        xmm5, xmm7              ; *oq2 = sq^0x80;
        pxor        xmm3, xmm7              ; *oq0 = sq^0x80
        pxor        xmm6, xmm7              ; *op0 = sp^0x80
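        ; write back the six filtered rows (p2..q2); for %1 == 2 only the
        ; middle four rows return to the stack, while p2/q2 stay in xmm2/xmm5
        ; for the vertical transpose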
%if %1 == 0
        movq        [rsi], xmm6             ; p0
        movhps      [rdi], xmm6
        movq        [rsi + rcx], xmm3       ; q0
        movhps      [rdi + rcx], xmm3
        lea         rdx, [rcx + rcx*2]
        movq        [rsi+rcx*2], xmm1       ; q1
        movhps      [rdi+rcx*2], xmm1

        movq        [rsi + rax], xmm4       ; p1
        movhps      [rdi + rax], xmm4

        movq        [rsi+rax*2], xmm2       ; p2
        movhps      [rdi+rax*2], xmm2

        movq        [rsi+rdx], xmm5         ; q2
        movhps      [rdi+rdx], xmm5
%elif %1 == 1
        movdqa      [rdi+rcx], xmm5         ; q2
        movdqa      [rdi], xmm1             ; q1
        movdqa      [rsi], xmm3             ; q0
        movdqa      [rsi+rax], xmm6         ; p0
        movdqa      [rsi+rax*2], xmm4       ; p1
        movdqa      [rdi+rax*4], xmm2       ; p2
%elif %1 == 2
        movdqa      [rsp+_p1], xmm4         ; p1
        movdqa      [rsp+_p0], xmm6         ; p0
        movdqa      [rsp+_q0], xmm3         ; q0
        movdqa      [rsp+_q1], xmm1         ; q1
%endif

%endmacro


;void vp8_mbloop_filter_horizontal_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;)
global sym(vp8_mbloop_filter_horizontal_edge_sse2) PRIVATE
sym(vp8_mbloop_filter_horizontal_edge_sse2):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 5
        SAVE_XMM 7
        GET_GOT     rbx
        push        rsi
        push        rdi
        ; end prolog

        ALIGN_STACK 16, rax
        sub         rsp, lf_var_size

        mov         rsi, arg(0)             ; src_ptr
        movsxd      rax, dword ptr arg(1)   ; src_pixel_step
        mov         rdx, arg(3)             ; limit

        lea         rdi, [rsi+rax]          ; rdi points to row +1 for indirect addressing

        ; calculate breakout conditions and high edge variance
        LFH_FILTER_AND_HEV_MASK 1
        ; filter and write back the results
        MB_FILTER_AND_WRITEBACK 1

        add         rsp, lf_var_size
        pop         rsp
        ; begin epilog
        pop         rdi
        pop         rsi
        RESTORE_GOT
        RESTORE_XMM
        UNSHADOW_ARGS
        pop         rbp
        ret


;void vp8_mbloop_filter_horizontal_edge_uv_sse2
;(
;    unsigned char *u,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
global sym(vp8_mbloop_filter_horizontal_edge_uv_sse2) PRIVATE
sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 6
        SAVE_XMM 7
        GET_GOT     rbx
        push        rsi
        push        rdi
        ; end prolog

        ALIGN_STACK 16, rax
        sub         rsp, lf_var_size

        mov         rsi, arg(0)             ; u
        mov         rdi, arg(5)             ; v
        movsxd      rax, dword ptr arg(1)   ; src_pixel_step
        mov         rcx, rax
        neg         rax                     ; negate pitch to deal with above border
        mov         rdx, arg(3)             ; limit

        lea         rsi, [rsi + rcx]
        lea         rdi, [rdi + rcx]

        ; calculate breakout conditions and high edge variance
        LFH_FILTER_AND_HEV_MASK 0
        ; filter and write back the results
        MB_FILTER_AND_WRITEBACK 0

        add         rsp, lf_var_size
        pop         rsp
        ; begin epilog
        pop         rdi
        pop         rsi
        RESTORE_GOT
        RESTORE_XMM
        UNSHADOW_ARGS
        pop         rbp
        ret


%macro TRANSPOSE_16X8 2
        movq        xmm4, [rsi]             ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
        movq        xmm1, [rdi]             ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
        movq        xmm0, [rsi+2*rax]       ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20
        movq        xmm7, [rdi+2*rax]       ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30
        movq        xmm5, [rsi+4*rax]       ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40
        movq        xmm2, [rdi+4*rax]       ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50

        punpcklbw   xmm4, xmm1              ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00

        movq        xmm1, [rdi+2*rcx]       ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70

        movdqa      xmm3, xmm4              ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
        punpcklbw   xmm0, xmm7              ; 37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20

        movq        xmm7, [rsi+2*rcx]       ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60

        punpcklbw   xmm5, xmm2              ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
%if %1
        lea         rsi, [rsi+rax*8]
        lea         rdi, [rdi+rax*8]
%else
        mov         rsi, arg(5)             ; v_ptr
%endif

        movdqa      xmm6, xmm5              ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
        punpcklbw   xmm7, xmm1              ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60
        punpcklwd   xmm5, xmm7              ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
        punpckhwd   xmm6, xmm7              ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44
        punpcklwd   xmm3, xmm0              ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00

%if %1 == 0
        lea         rdi, [rsi + rax - 4]    ; rdi points to row +1 for indirect addressing
        lea         rsi, [rsi - 4]
%endif

        movdqa      xmm2, xmm3              ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
        punpckhwd   xmm4, xmm0              ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04

        movdqa      xmm7, xmm4              ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
        punpckhdq   xmm3, xmm5              ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02

        punpckhdq   xmm7, xmm6              ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06

        punpckldq   xmm4, xmm6              ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04

        punpckldq   xmm2, xmm5              ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00

        movdqa      [rsp+_t0], xmm2         ; save to free XMM2

        movq        xmm2, [rsi]             ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80
        movq        xmm6, [rdi]             ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90
        movq        xmm0, [rsi+2*rax]       ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0
        movq        xmm5, [rdi+2*rax]       ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0
        movq        xmm1, [rsi+4*rax]       ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0

        punpcklbw   xmm2, xmm6              ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80

        movq        xmm6, [rdi+4*rax]       ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0

        punpcklbw   xmm0, xmm5              ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0

        movq        xmm5, [rsi+2*rcx]       ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0

        punpcklbw   xmm1, xmm6              ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 c1 d0 c0

        movq        xmm6, [rdi+2*rcx]       ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0

        punpcklbw   xmm5, xmm6              ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0

        movdqa      xmm6, xmm1
        punpckhwd   xmm6, xmm5              ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4

        punpcklwd   xmm1, xmm5              ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
        movdqa      xmm5, xmm2              ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80

        punpcklwd   xmm5, xmm0              ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80

        punpckhwd   xmm2, xmm0              ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84

        movdqa      xmm0, xmm5
        punpckldq   xmm0, xmm1              ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80

        punpckhdq   xmm5, xmm1              ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
        movdqa      xmm1, xmm2              ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84

        punpckldq   xmm1, xmm6              ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84

        punpckhdq   xmm2, xmm6              ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86
        movdqa      xmm6, xmm7              ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06

        punpcklqdq  xmm6, xmm2              ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06

        punpckhqdq  xmm7, xmm2              ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07

%if %2 == 0
        movdqa      [rsp+_q3], xmm7         ; save 7
        movdqa      [rsp+_q2], xmm6         ; save 6
%endif
        movdqa      xmm2, xmm3              ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
        punpckhqdq  xmm3, xmm5              ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
        punpcklqdq  xmm2, xmm5              ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        movdqa      [rsp+_p1], xmm2         ; save 2

        movdqa      xmm5, xmm4              ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
        punpcklqdq  xmm4, xmm1              ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        movdqa      [rsp+_p0], xmm3         ; save 3

        punpckhqdq  xmm5, xmm1              ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05

        movdqa      [rsp+_q0], xmm4         ; save 4
        movdqa      [rsp+_q1], xmm5         ; save 5
        movdqa      xmm1, [rsp+_t0]
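
        ; recover rows 0 and 1 (p3 and p2 of the transposed block) from the
        ; dwords saved in t0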
        movdqa      xmm2, xmm1
        punpckhqdq  xmm1, xmm0              ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
        punpcklqdq  xmm2, xmm0              ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00

%if %2 == 0
        movdqa      [rsp+_p2], xmm1
        movdqa      [rsp+_p3], xmm2
%endif

%endmacro

%macro LFV_FILTER_MASK_HEV_MASK 0
        movdqa      xmm0, xmm6              ; q2
        psubusb     xmm0, xmm7              ; q2-q3

        psubusb     xmm7, xmm6              ; q3-q2
        movdqa      xmm4, xmm5              ; q1

        por         xmm7, xmm0              ; abs (q3-q2)
        psubusb     xmm4, xmm6              ; q1-q2

        movdqa      xmm0, xmm1
        psubusb     xmm6, xmm5              ; q2-q1

        por         xmm6, xmm4              ; abs (q2-q1)
        psubusb     xmm0, xmm2              ; p2 - p3

        psubusb     xmm2, xmm1              ; p3 - p2
        por         xmm0, xmm2              ; abs(p2-p3)

        movdqa      xmm5, [rsp+_p1]         ; p1
        pmaxub      xmm0, xmm7

        movdqa      xmm2, xmm5              ; p1
        psubusb     xmm5, xmm1              ; p1-p2
        psubusb     xmm1, xmm2              ; p2-p1

        movdqa      xmm7, xmm3              ; p0
        psubusb     xmm7, xmm2              ; p0-p1

        por         xmm1, xmm5              ; abs(p2-p1)
        pmaxub      xmm0, xmm6

        pmaxub      xmm0, xmm1
        movdqa      xmm1, xmm2              ; p1

        psubusb     xmm2, xmm3              ; p1-p0

        por         xmm2, xmm7              ; abs(p1-p0)

        pmaxub      xmm0, xmm2

        movdqa      xmm5, [rsp+_q0]         ; q0
        movdqa      xmm7, [rsp+_q1]         ; q1

        mov         rdx, arg(3)             ; limit

        movdqa      xmm6, xmm5              ; q0
        movdqa      xmm4, xmm7              ; q1

        psubusb     xmm5, xmm7              ; q0-q1
        psubusb     xmm7, xmm6              ; q1-q0

        por         xmm7, xmm5              ; abs(q1-q0)

        pmaxub      xmm0, xmm7

        psubusb     xmm0, [rdx]             ; limit

        mov         rdx, arg(2)             ; blimit
        movdqa      xmm5, xmm4              ; q1

        psubusb     xmm5, xmm1              ; q1-=p1
        psubusb     xmm1, xmm4              ; p1-=q1

        por         xmm5, xmm1              ; abs(p1-q1)
        movdqa      xmm1, xmm3              ; p0

        pand        xmm5, [GLOBAL(tfe)]     ; set lsb of each byte to zero
        psubusb     xmm1, xmm6              ; p0-q0

        movdqa      xmm4, [rdx]             ; blimit
        mov         rdx, arg(4)             ; get thresh

        psrlw       xmm5, 1                 ; abs(p1-q1)/2
        psubusb     xmm6, xmm3              ; q0-p0

        por         xmm1, xmm6              ; abs(q0-p0)
        paddusb     xmm1, xmm1              ; abs(q0-p0)*2
        movdqa      xmm3, [rdx]             ; thresh

        paddusb     xmm1, xmm5              ; abs (p0 - q0) *2 + abs(p1-q1)/2
        psubusb     xmm2, xmm3              ; abs(p1 - p0) > thresh

        psubusb     xmm7, xmm3              ; abs(q1 - q0) > thresh

        psubusb     xmm1, xmm4              ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
        por         xmm2, xmm7              ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh

        por         xmm1, xmm0              ; mask
        pcmpeqb     xmm2, xmm0

        pxor        xmm0, xmm0
        pcmpeqb     xmm4, xmm4

        pcmpeqb     xmm1, xmm0
        pxor        xmm4, xmm2
%endmacro
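
; BV_TRANSPOSE re-interleaves the four filtered rows (p1, p0, q0, q1) back
; into pixel order; BV_WRITEBACK then stores them as 4-byte columns at byte
; offset +2, i.e. columns 2-5 of the original 16x8 block.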
%macro BV_TRANSPOSE 0
        ; xmm1 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        ; xmm6 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
        ; xmm3 = f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        ; xmm7 = f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
        movdqa      xmm2, xmm1              ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        punpcklbw   xmm2, xmm6              ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02

        movdqa      xmm4, xmm3              ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        punpckhbw   xmm1, xmm6              ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        punpcklbw   xmm4, xmm7              ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04

        punpckhbw   xmm3, xmm7              ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84

        movdqa      xmm6, xmm2              ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
        punpcklwd   xmm2, xmm4              ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02

        punpckhwd   xmm6, xmm4              ; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
        movdqa      xmm5, xmm1              ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        punpcklwd   xmm1, xmm3              ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82

        punpckhwd   xmm5, xmm3              ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
        ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
        ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
        ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
        ; xmm5 = f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
%endmacro

%macro BV_WRITEBACK 2
        movd        [rsi+2], %1
        movd        [rsi+4*rax+2], %2
        psrldq      %1, 4
        psrldq      %2, 4
        movd        [rdi+2], %1
        movd        [rdi+4*rax+2], %2
        psrldq      %1, 4
        psrldq      %2, 4
        movd        [rsi+2*rax+2], %1
        movd        [rsi+2*rcx+2], %2
        psrldq      %1, 4
        psrldq      %2, 4
        movd        [rdi+2*rax+2], %1
        movd        [rdi+2*rcx+2], %2
%endmacro

%if ABI_IS_32BIT

;void vp8_loop_filter_vertical_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;)
global sym(vp8_loop_filter_vertical_edge_sse2) PRIVATE
sym(vp8_loop_filter_vertical_edge_sse2):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 5
        SAVE_XMM 7
        GET_GOT     rbx
        push        rsi
        push        rdi
        ; end prolog

        ALIGN_STACK 16, rax
        sub         rsp, lf_var_size

        mov         rsi, arg(0)             ; src_ptr
        movsxd      rax, dword ptr arg(1)   ; src_pixel_step

        lea         rsi, [rsi - 4]
        lea         rdi, [rsi + rax]        ; rdi points to row +1 for indirect addressing
        lea         rcx, [rax*2+rax]

        ;transpose 16x8 to 8x16, and store the 8-line result on stack.
        TRANSPOSE_16X8 1, 1

        ; calculate filter mask and high edge variance
        LFV_FILTER_MASK_HEV_MASK

        ; start work on filters
        B_FILTER 2

        ; transpose and write back - only work on q1, q0, p0, p1
        BV_TRANSPOSE
        ; store 16-line result

        lea         rdx, [rax]
        neg         rdx

        BV_WRITEBACK xmm1, xmm5

        lea         rsi, [rsi+rdx*8]
        lea         rdi, [rdi+rdx*8]
        BV_WRITEBACK xmm2, xmm6

        add         rsp, lf_var_size
        pop         rsp
        ; begin epilog
        pop         rdi
        pop         rsi
        RESTORE_GOT
        RESTORE_XMM
        UNSHADOW_ARGS
        pop         rbp
        ret

%endif

;void vp8_loop_filter_vertical_edge_uv_sse2
;(
;    unsigned char *u,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
global sym(vp8_loop_filter_vertical_edge_uv_sse2) PRIVATE
sym(vp8_loop_filter_vertical_edge_uv_sse2):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 6
        SAVE_XMM 7
        GET_GOT     rbx
        push        rsi
        push        rdi
        ; end prolog

        ALIGN_STACK 16, rax
        sub         rsp, lf_var_size

        mov         rsi, arg(0)             ; u_ptr
        movsxd      rax, dword ptr arg(1)   ; src_pixel_step

        lea         rsi, [rsi - 4]
        lea         rdi, [rsi + rax]        ; rdi points to row +1 for indirect addressing
        lea         rcx, [rax+2*rax]
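
        ; TRANSPOSE_16X8 0, 1 packs the 8 u rows into the low halves and the
        ; 8 v rows into the high halves of each xmm register, so both chroma
        ; planes are filtered in a single pass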
        ;transpose 16x8 to 8x16, and store the 8-line result on stack.
        TRANSPOSE_16X8 0, 1

        ; calculate filter mask and high edge variance
        LFV_FILTER_MASK_HEV_MASK

        ; start work on filters
        B_FILTER 2

        ; transpose and write back - only work on q1, q0, p0, p1
        BV_TRANSPOSE

        lea         rdi, [rsi + rax]        ; rdi points to row +1 for indirect addressing

        ; store 16-line result
        BV_WRITEBACK xmm1, xmm5

        mov         rsi, arg(0)             ; u_ptr
        lea         rsi, [rsi - 4]
        lea         rdi, [rsi + rax]        ; rdi points to row +1 for indirect addressing
        BV_WRITEBACK xmm2, xmm6

        add         rsp, lf_var_size
        pop         rsp
        ; begin epilog
        pop         rdi
        pop         rsi
        RESTORE_GOT
        RESTORE_XMM
        UNSHADOW_ARGS
        pop         rbp
        ret

%macro MBV_TRANSPOSE 0
        movdqa      xmm0, [rsp+_p3]         ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
        movdqa      xmm1, xmm0              ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00

        punpcklbw   xmm0, xmm2              ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
        punpckhbw   xmm1, xmm2              ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80

        movdqa      xmm7, [rsp+_p1]         ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        movdqa      xmm6, xmm7              ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02

        punpcklbw   xmm7, [rsp+_p0]         ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
        punpckhbw   xmm6, [rsp+_p0]         ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        movdqa      xmm3, xmm0              ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
        punpcklwd   xmm0, xmm7              ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00

        punpckhwd   xmm3, xmm7              ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
        movdqa      xmm4, xmm1              ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80

        punpcklwd   xmm1, xmm6              ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
        punpckhwd   xmm4, xmm6              ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0

        movdqa      xmm7, [rsp+_q0]         ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        punpcklbw   xmm7, [rsp+_q1]         ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04

        movdqa      xmm6, xmm5              ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
        punpcklbw   xmm6, [rsp+_q3]         ; 77 76 67 66 57 56 47 46 37 36 27 26 17 16 07 06

        movdqa      xmm2, xmm7              ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
        punpcklwd   xmm7, xmm6              ; 37 36 35 34 27 26 25 24 17 16 15 14 07 06 05 04

        punpckhwd   xmm2, xmm6              ; 77 76 75 74 67 66 65 64 57 56 55 54 47 46 45 44
        movdqa      xmm6, xmm0              ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00

        punpckldq   xmm0, xmm7              ; 17 16 15 14 13 12 11 10 07 06 05 04 03 02 01 00
        punpckhdq   xmm6, xmm7              ; 37 36 35 34 33 32 31 30 27 26 25 24 23 22 21 20
%endmacro

%macro MBV_WRITEBACK_1 0
        movq        [rsi], xmm0
        movhps      [rdi], xmm0

        movq        [rsi+2*rax], xmm6
        movhps      [rdi+2*rax], xmm6

        movdqa      xmm0, xmm3              ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
        punpckldq   xmm0, xmm2              ; 57 56 55 54 53 52 51 50 47 46 45 44 43 42 41 40
        punpckhdq   xmm3, xmm2              ; 77 76 75 74 73 72 71 70 67 66 65 64 63 62 61 60

        movq        [rsi+4*rax], xmm0
        movhps      [rdi+4*rax], xmm0

        movq        [rsi+2*rcx], xmm3
        movhps      [rdi+2*rcx], xmm3

        movdqa      xmm7, [rsp+_q0]         ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        punpckhbw   xmm7, [rsp+_q1]         ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
        punpckhbw   xmm5, [rsp+_q3]         ; f7 f6 e7 e6 d7 d6 c7 c6 b7 b6 a7 a6 97 96 87 86

        movdqa      xmm0, xmm7
        punpcklwd   xmm0, xmm5              ; b7 b6 b5 b4 a7 a6 a5 a4 97 96 95 94 87 86 85 84
        punpckhwd   xmm7, xmm5              ; f7 f6 f5 f4 e7 e6 e5 e4 d7 d6 d5 d4 c7 c6 c5 c4
        movdqa      xmm5, xmm1              ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
        punpckldq   xmm1, xmm0              ; 97 96 95 94 93 92 91 90 87 86 85 84 83 82 81 80
        punpckhdq   xmm5, xmm0              ; b7 b6 b5 b4 b3 b2 b1 b0 a7 a6 a5 a4 a3 a2 a1 a0
%endmacro

%macro MBV_WRITEBACK_2 0
        movq        [rsi], xmm1
        movhps      [rdi], xmm1

        movq        [rsi+2*rax], xmm5
        movhps      [rdi+2*rax], xmm5

        movdqa      xmm1, xmm4              ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
        punpckldq   xmm1, xmm7              ; d7 d6 d5 d4 d3 d2 d1 d0 c7 c6 c5 c4 c3 c2 c1 c0
        punpckhdq   xmm4, xmm7              ; f7 f6 f5 f4 f3 f2 f1 f0 e7 e6 e5 e4 e3 e2 e1 e0

        movq        [rsi+4*rax], xmm1
        movhps      [rdi+4*rax], xmm1

        movq        [rsi+2*rcx], xmm4
        movhps      [rdi+2*rcx], xmm4
%endmacro


;void vp8_mbloop_filter_vertical_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;)
global sym(vp8_mbloop_filter_vertical_edge_sse2) PRIVATE
sym(vp8_mbloop_filter_vertical_edge_sse2):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 5
        SAVE_XMM 7
        GET_GOT     rbx
        push        rsi
        push        rdi
        ; end prolog

        ALIGN_STACK 16, rax
        sub         rsp, lf_var_size

        mov         rsi, arg(0)             ; src_ptr
        movsxd      rax, dword ptr arg(1)   ; src_pixel_step

        lea         rsi, [rsi - 4]
        lea         rdi, [rsi + rax]        ; rdi points to row +1 for indirect addressing
        lea         rcx, [rax*2+rax]

        ; Transpose
        TRANSPOSE_16X8 1, 0

        ; calculate filter mask and high edge variance
        LFV_FILTER_MASK_HEV_MASK

        neg         rax
        ; start work on filters
        MB_FILTER_AND_WRITEBACK 2

        lea         rsi, [rsi+rax*8]
        lea         rdi, [rdi+rax*8]

        ; transpose and write back
        MBV_TRANSPOSE

        neg         rax

        MBV_WRITEBACK_1

        lea         rsi, [rsi+rax*8]
        lea         rdi, [rdi+rax*8]
        MBV_WRITEBACK_2

        add         rsp, lf_var_size
        pop         rsp
        ; begin epilog
        pop         rdi
        pop         rsi
        RESTORE_GOT
        RESTORE_XMM
        UNSHADOW_ARGS
        pop         rbp
        ret


;void vp8_mbloop_filter_vertical_edge_uv_sse2
;(
;    unsigned char *u,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
global sym(vp8_mbloop_filter_vertical_edge_uv_sse2) PRIVATE
sym(vp8_mbloop_filter_vertical_edge_uv_sse2):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 6
        SAVE_XMM 7
        GET_GOT     rbx
        push        rsi
        push        rdi
        ; end prolog

        ALIGN_STACK 16, rax
        sub         rsp, lf_var_size

        mov         rsi, arg(0)             ; u_ptr
        movsxd      rax, dword ptr arg(1)   ; src_pixel_step

        lea         rsi, [rsi - 4]
        lea         rdi, [rsi + rax]        ; rdi points to row +1 for indirect addressing
        lea         rcx, [rax+2*rax]

        ; Transpose
        TRANSPOSE_16X8 0, 0

        ; calculate filter mask and high edge variance
        LFV_FILTER_MASK_HEV_MASK

        ; start work on filters
        MB_FILTER_AND_WRITEBACK 2

        ; transpose and write back
        MBV_TRANSPOSE

        mov         rsi, arg(0)             ; u_ptr
        lea         rsi, [rsi - 4]
        lea         rdi, [rsi + rax]
        MBV_WRITEBACK_1
        mov         rsi, arg(5)             ; v_ptr
        lea         rsi, [rsi - 4]
        lea         rdi, [rsi + rax]
        MBV_WRITEBACK_2

        add         rsp, lf_var_size
        pop         rsp
        ; begin epilog
        pop         rdi
        pop         rsi
        RESTORE_GOT
        RESTORE_XMM
        UNSHADOW_ARGS
        pop         rbp
        ret

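
; The "simple" loop filter below reads only p1, p0, q0 and q1.  Its mask is
; the single test abs(p0-q0)*2 + abs(p1-q1)/2 <= blimit (no limit or thresh,
; hence no hev), and only p0 and q0 are modified.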
;void vp8_loop_filter_simple_horizontal_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;)
global sym(vp8_loop_filter_simple_horizontal_edge_sse2) PRIVATE
sym(vp8_loop_filter_simple_horizontal_edge_sse2):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 3
        SAVE_XMM 7
        GET_GOT     rbx
        ; end prolog

        mov         rcx, arg(0)             ; src_ptr
        movsxd      rax, dword ptr arg(1)   ; src_pixel_step ; destination pitch?
        movdqa      xmm6, [GLOBAL(tfe)]
        lea         rdx, [rcx + rax]
        neg         rax

        ; calculate mask
        movdqa      xmm0, [rdx]             ; q1
        mov         rdx, arg(2)             ; blimit
        movdqa      xmm1, [rcx+2*rax]       ; p1

        movdqa      xmm2, xmm1
        movdqa      xmm3, xmm0

        psubusb     xmm0, xmm1              ; q1-=p1
        psubusb     xmm1, xmm3              ; p1-=q1
        por         xmm1, xmm0              ; abs(p1-q1)
        pand        xmm1, xmm6              ; set lsb of each byte to zero
        psrlw       xmm1, 1                 ; abs(p1-q1)/2

        movdqa      xmm7, XMMWORD PTR [rdx] ; blimit

        movdqa      xmm5, [rcx+rax]         ; p0
        movdqa      xmm4, [rcx]             ; q0
        movdqa      xmm0, xmm4              ; q0
        movdqa      xmm6, xmm5              ; p0
        psubusb     xmm5, xmm4              ; p0-=q0
        psubusb     xmm4, xmm6              ; q0-=p0
        por         xmm5, xmm4              ; abs(p0 - q0)

        movdqa      xmm4, [GLOBAL(t80)]

        paddusb     xmm5, xmm5              ; abs(p0-q0)*2
        paddusb     xmm5, xmm1              ; abs (p0 - q0) *2 + abs(p1-q1)/2
        psubusb     xmm5, xmm7              ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
        pxor        xmm7, xmm7
        pcmpeqb     xmm5, xmm7

        ; start work on filters
        pxor        xmm2, xmm4              ; p1 offset to convert to signed values
        pxor        xmm3, xmm4              ; q1 offset to convert to signed values
        psubsb      xmm2, xmm3              ; p1 - q1

        pxor        xmm6, xmm4              ; offset to convert to signed values
        pxor        xmm0, xmm4              ; offset to convert to signed values
        movdqa      xmm3, xmm0              ; q0
        psubsb      xmm0, xmm6              ; q0 - p0
        paddsb      xmm2, xmm0              ; p1 - q1 + 1 * (q0 - p0)
        paddsb      xmm2, xmm0              ; p1 - q1 + 2 * (q0 - p0)
        paddsb      xmm2, xmm0              ; p1 - q1 + 3 * (q0 - p0)
        pand        xmm5, xmm2              ; mask filter values we don't care about

        movdqa      xmm0, xmm5
        paddsb      xmm5, [GLOBAL(t3)]      ; 3 * (q0 - p0) + (p1 - q1) + 3
        paddsb      xmm0, [GLOBAL(t4)]      ; 3 * (q0 - p0) + (p1 - q1) + 4

        movdqa      xmm1, [GLOBAL(te0)]
        movdqa      xmm2, [GLOBAL(t1f)]

;       pxor        xmm7, xmm7
        pcmpgtb     xmm7, xmm0              ; save sign
        pand        xmm7, xmm1              ; preserve the upper 3 bits
        psrlw       xmm0, 3
        pand        xmm0, xmm2              ; clear out upper 3 bits
        por         xmm0, xmm7              ; add sign
        psubsb      xmm3, xmm0              ; q0 -= q0 add

        pxor        xmm7, xmm7
        pcmpgtb     xmm7, xmm5              ; save sign
        pand        xmm7, xmm1              ; preserve the upper 3 bits
        psrlw       xmm5, 3
        pand        xmm5, xmm2              ; clear out upper 3 bits
        por         xmm5, xmm7              ; add sign
        paddsb      xmm6, xmm5              ; p0 += p0 add

        pxor        xmm3, xmm4              ; unoffset
        movdqa      [rcx], xmm3             ; write back

        pxor        xmm6, xmm4              ; unoffset
        movdqa      [rcx+rax], xmm6         ; write back

        ; begin epilog
        RESTORE_GOT
        RESTORE_XMM
        UNSHADOW_ARGS
        pop         rbp
        ret

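
; The vertical simple filter gathers 16 rows of 4 bytes each with movd,
; transposes them so p1, p0, q0 and q1 each occupy a whole register,
; filters, and then transposes back for the write-out.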
;void vp8_loop_filter_simple_vertical_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;)
global sym(vp8_loop_filter_simple_vertical_edge_sse2) PRIVATE
sym(vp8_loop_filter_simple_vertical_edge_sse2):
        push        rbp                     ; save old base pointer value.
        mov         rbp, rsp                ; set new base pointer value.
        SHADOW_ARGS_TO_STACK 3
        SAVE_XMM 7
        GET_GOT     rbx                     ; save callee-saved reg
        push        rsi
        push        rdi
        ; end prolog

        ALIGN_STACK 16, rax
        sub         rsp, 32                 ; reserve 32 bytes
        %define t0  [rsp + 0]               ;__declspec(align(16)) char t0[16];
        %define t1  [rsp + 16]              ;__declspec(align(16)) char t1[16];

        mov         rsi, arg(0)             ; src_ptr
        movsxd      rax, dword ptr arg(1)   ; src_pixel_step ; destination pitch?

        lea         rsi, [rsi - 2]
        lea         rdi, [rsi + rax]
        lea         rdx, [rsi + rax*4]
        lea         rcx, [rdx + rax]

        movd        xmm0, [rsi]             ; (high 96 bits unused) 03 02 01 00
        movd        xmm1, [rdx]             ; (high 96 bits unused) 43 42 41 40
        movd        xmm2, [rdi]             ; 13 12 11 10
        movd        xmm3, [rcx]             ; 53 52 51 50
        punpckldq   xmm0, xmm1              ; (high 64 bits unused) 43 42 41 40 03 02 01 00
        punpckldq   xmm2, xmm3              ; 53 52 51 50 13 12 11 10

        movd        xmm4, [rsi + rax*2]     ; 23 22 21 20
        movd        xmm5, [rdx + rax*2]     ; 63 62 61 60
        movd        xmm6, [rdi + rax*2]     ; 33 32 31 30
        movd        xmm7, [rcx + rax*2]     ; 73 72 71 70
        punpckldq   xmm4, xmm5              ; 63 62 61 60 23 22 21 20
        punpckldq   xmm6, xmm7              ; 73 72 71 70 33 32 31 30

        punpcklbw   xmm0, xmm2              ; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
        punpcklbw   xmm4, xmm6              ; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20

        movdqa      xmm1, xmm0
        punpcklwd   xmm0, xmm4              ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
        punpckhwd   xmm1, xmm4              ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40

        movdqa      xmm2, xmm0
        punpckldq   xmm0, xmm1              ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
        punpckhdq   xmm2, xmm1              ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02

        lea         rsi, [rsi + rax*8]
        lea         rdi, [rsi + rax]
        lea         rdx, [rsi + rax*4]
        lea         rcx, [rdx + rax]

        movd        xmm4, [rsi]             ; 83 82 81 80
        movd        xmm1, [rdx]             ; c3 c2 c1 c0
        movd        xmm6, [rdi]             ; 93 92 91 90
        movd        xmm3, [rcx]             ; d3 d2 d1 d0
        punpckldq   xmm4, xmm1              ; c3 c2 c1 c0 83 82 81 80
        punpckldq   xmm6, xmm3              ; d3 d2 d1 d0 93 92 91 90

        movd        xmm1, [rsi + rax*2]     ; a3 a2 a1 a0
        movd        xmm5, [rdx + rax*2]     ; e3 e2 e1 e0
        movd        xmm3, [rdi + rax*2]     ; b3 b2 b1 b0
        movd        xmm7, [rcx + rax*2]     ; f3 f2 f1 f0
        punpckldq   xmm1, xmm5              ; e3 e2 e1 e0 a3 a2 a1 a0
        punpckldq   xmm3, xmm7              ; f3 f2 f1 f0 b3 b2 b1 b0

        punpcklbw   xmm4, xmm6              ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80
        punpcklbw   xmm1, xmm3              ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0

        movdqa      xmm7, xmm4
        punpcklwd   xmm4, xmm1              ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
        punpckhwd   xmm7, xmm1              ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0

        movdqa      xmm6, xmm4
        punpckldq   xmm4, xmm7              ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
        punpckhdq   xmm6, xmm7              ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82

        movdqa      xmm1, xmm0
        movdqa      xmm3, xmm2

        punpcklqdq  xmm0, xmm4              ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
        punpckhqdq  xmm1, xmm4              ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
        punpcklqdq  xmm2, xmm6              ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        punpckhqdq  xmm3, xmm6              ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03

        mov         rdx, arg(2)             ; blimit

        ; calculate mask
        movdqa      xmm6, xmm0              ; p1
        movdqa      xmm7, xmm3              ; q1
        psubusb     xmm7, xmm0              ; q1-=p1
        psubusb     xmm6, xmm3              ; p1-=q1
        por         xmm6, xmm7              ; abs(p1-q1)
        pand        xmm6, [GLOBAL(tfe)]     ; set lsb of each byte to zero
        psrlw       xmm6, 1                 ; abs(p1-q1)/2

        movdqa      xmm7, [rdx]             ; blimit
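
        ; the blimit comparison below is the simple filter's only breakout
        ; test; no limit/thresh values are loaded here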
        movdqa      xmm5, xmm1              ; p0
        movdqa      xmm4, xmm2              ; q0
        psubusb     xmm5, xmm2              ; p0-=q0
        psubusb     xmm4, xmm1              ; q0-=p0
        por         xmm5, xmm4              ; abs(p0 - q0)
        paddusb     xmm5, xmm5              ; abs(p0-q0)*2
        paddusb     xmm5, xmm6              ; abs (p0 - q0) *2 + abs(p1-q1)/2

        movdqa      xmm4, [GLOBAL(t80)]

        psubusb     xmm5, xmm7              ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
        pxor        xmm7, xmm7
        pcmpeqb     xmm5, xmm7              ; mm5 = mask

        ; start work on filters
        movdqa      t0, xmm0
        movdqa      t1, xmm3

        pxor        xmm0, xmm4              ; p1 offset to convert to signed values
        pxor        xmm3, xmm4              ; q1 offset to convert to signed values
        psubsb      xmm0, xmm3              ; p1 - q1

        pxor        xmm1, xmm4              ; offset to convert to signed values
        pxor        xmm2, xmm4              ; offset to convert to signed values

        movdqa      xmm3, xmm2              ; offseted ; q0
        psubsb      xmm2, xmm1              ; q0 - p0
        paddsb      xmm0, xmm2              ; p1 - q1 + 1 * (q0 - p0)
        paddsb      xmm0, xmm2              ; p1 - q1 + 2 * (q0 - p0)
        paddsb      xmm0, xmm2              ; p1 - q1 + 3 * (q0 - p0)
        pand        xmm5, xmm0              ; mask filter values we don't care about

        movdqa      xmm0, xmm5
        paddsb      xmm5, [GLOBAL(t3)]      ; 3 * (q0 - p0) + (p1 - q1) + 3
        paddsb      xmm0, [GLOBAL(t4)]      ; 3 * (q0 - p0) + (p1 - q1) + 4

        movdqa      xmm6, [GLOBAL(te0)]
        movdqa      xmm2, [GLOBAL(t1f)]

;       pxor        xmm7, xmm7
        pcmpgtb     xmm7, xmm0              ; save sign
        pand        xmm7, xmm6              ; preserve the upper 3 bits
        psrlw       xmm0, 3
        pand        xmm0, xmm2              ; clear out upper 3 bits
        por         xmm0, xmm7              ; add sign
        psubsb      xmm3, xmm0              ; q0 -= q0 add

        pxor        xmm7, xmm7
        pcmpgtb     xmm7, xmm5              ; save sign
        pand        xmm7, xmm6              ; preserve the upper 3 bits
        psrlw       xmm5, 3
        pand        xmm5, xmm2              ; clear out upper 3 bits
        por         xmm5, xmm7              ; add sign
        paddsb      xmm1, xmm5              ; p0 += p0 add

        pxor        xmm3, xmm4              ; unoffset q0
        pxor        xmm1, xmm4              ; unoffset p0

        movdqa      xmm0, t0                ; p1
        movdqa      xmm4, t1                ; q1

        ; write out order: xmm0 xmm2 xmm1 xmm3
        lea         rdx, [rsi + rax*4]

        ; transpose back to write out
        ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
        ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
        ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
        movdqa      xmm6, xmm0
        punpcklbw   xmm0, xmm1              ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
        punpckhbw   xmm6, xmm1              ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80

        movdqa      xmm5, xmm3
        punpcklbw   xmm3, xmm4              ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
        punpckhbw   xmm5, xmm4              ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        movdqa      xmm2, xmm0
        punpcklwd   xmm0, xmm3              ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
        punpckhwd   xmm2, xmm3              ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40

        movdqa      xmm3, xmm6
        punpcklwd   xmm6, xmm5              ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
        punpckhwd   xmm3, xmm5              ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0

        movd        [rsi], xmm6             ; write the second 8-line result
        movd        [rdx], xmm3
        psrldq      xmm6, 4
        psrldq      xmm3, 4
        movd        [rdi], xmm6
        movd        [rcx], xmm3
        psrldq      xmm6, 4
        psrldq      xmm3, 4
        movd        [rsi + rax*2], xmm6
        movd        [rdx + rax*2], xmm3
        psrldq      xmm6, 4
        psrldq      xmm3, 4
        movd        [rdi + rax*2], xmm6
        movd        [rcx + rax*2], xmm3

        neg         rax
        lea         rsi, [rsi + rax*8]
        neg         rax
        lea         rdi, [rsi + rax]
        lea         rdx, [rsi + rax*4]
        lea         rcx, [rdx + rax]
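
        ; each movd below stores one row's p1 p0 q0 q1 column; psrldq by 4
        ; then rotates the next row's four bytes into the low dword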
        movd        [rsi], xmm0             ; write the first 8-line result
        movd        [rdx], xmm2
        psrldq      xmm0, 4
        psrldq      xmm2, 4
        movd        [rdi], xmm0
        movd        [rcx], xmm2
        psrldq      xmm0, 4
        psrldq      xmm2, 4
        movd        [rsi + rax*2], xmm0
        movd        [rdx + rax*2], xmm2
        psrldq      xmm0, 4
        psrldq      xmm2, 4
        movd        [rdi + rax*2], xmm0
        movd        [rcx + rax*2], xmm2

        add         rsp, 32
        pop         rsp
        ; begin epilog
        pop         rdi
        pop         rsi
        RESTORE_GOT
        RESTORE_XMM
        UNSHADOW_ARGS
        pop         rbp
        ret

SECTION_RODATA
align 16
tfe:
    times 16 db 0xfe
align 16
t80:
    times 16 db 0x80
align 16
t1s:
    times 16 db 0x01
align 16
t3:
    times 16 db 0x03
align 16
t4:
    times 16 db 0x04
align 16
ones:
    times 8 dw 0x0001
align 16
s9:
    times 8 dw 0x0900
align 16
s63:
    times 8 dw 0x003f
align 16
te0:
    times 16 db 0xe0
align 16
t1f:
    times 16 db 0x1f
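
; Constant notes: t80 biases unsigned pixels into signed range (x ^ 0x80);
; tfe clears each byte's lsb so the word-wise psrlw by 1 cannot pull a bit in
; from the neighbouring byte; s9 is 9 << 8 for the pmulhw multiply used in
; MB_FILTER_AND_WRITEBACK; te0/t1f rebuild an arithmetic >>3 on packed bytes
; from the sign mask produced by pcmpgtb.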