;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

%define VP8_FILTER_WEIGHT 128
%define VP8_FILTER_SHIFT  7

;-----------------------------------------------------------------------
;void vp8_post_proc_down_and_across_mmx
;(
;    unsigned char *src_ptr,
;    unsigned char *dst_ptr,
;    int src_pixels_per_line,
;    int dst_pixels_per_line,
;    int rows,
;    int cols,
;    int flimit
;)
;
; For every row, two passes are made, four pixels per iteration:
;   1. "down":   a 5-tap vertical blur (rows -2..+2 of src, weights taken
;                from the Blur table, rounded with rd and shifted right by
;                VP8_FILTER_SHIFT) is written to dst.
;   2. "across": the same 5-tap blur is applied horizontally, in place,
;                on the dst row that was just written.
; In both passes a pixel keeps its unfiltered value when the absolute
; difference between it and any of its four neighbours exceeds flimit.
;
; Register roles:
;   rsi = current source row          rdi = current destination row
;   rax = source pitch (down pass) / delayed output dword (across pass)
;   rbx = address of Blur             rcx = rows remaining
;   rdx = column offset within row    mm0 = zero
;   mm2 = flimit replicated into four words
;
; Arguments are read through the arg(n) macro of x86_abi_support.asm.
; NOTE(review): no emms is issued here; presumably the caller clears the
; MMX state - confirm.
;-----------------------------------------------------------------------
global sym(vp8_post_proc_down_and_across_mmx)
sym(vp8_post_proc_down_and_across_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; rbx is repurposed below as the Blur table base. It is saved BEFORE
    ; the optional stack copy of rd is made so that the copy stays at
    ; [rsp] for the whole function.
    push        rbx

%if ABI_IS_32BIT=1 && CONFIG_PIC=1
    ; move the global rd onto the stack, since we don't have enough registers
    ; to do PIC addressing.  rsp must not change while RD is in use, so
    ; nothing is pushed between here and the matching "add rsp, 8" below.
    movq        mm0, [GLOBAL(rd)]
    sub         rsp, 8
    movq        [rsp], mm0
%define RD [rsp]
%else
%define RD [GLOBAL(rd)]
%endif

        lea         rbx,        [GLOBAL(Blur)]
        movd        mm2,        dword ptr arg(6)     ; flimit
        punpcklwd   mm2,        mm2                  ; replicate flimit into
        punpckldq   mm2,        mm2                  ; all four words of mm2

        mov         rsi,        arg(0)               ; src_ptr
        mov         rdi,        arg(1)               ; dst_ptr

        movsxd      rcx,        DWORD PTR arg(4)     ; rows
        movsxd      rax,        DWORD PTR arg(2)     ; src_pixels_per_line
        pxor        mm0,        mm0                  ; mm0 = 00000000

nextrow:

        xor         rdx,        rdx                  ; rdx = column offset (loop counter)

nextcol:

        pxor        mm7,        mm7                  ; mm7 = 00000000
        movq        mm6,        [rbx + 32 ]          ; mm6 = kernel 2 taps
        movq        mm3,        [rsi]                ; mm3 = r0 p0..p7
        punpcklbw   mm3,        mm0                  ; mm3 = r0 p0..p3 as words
        movq        mm1,        mm3                  ; mm1 = r0 p0..p3 (kept as the unfiltered source)
        pmullw      mm3,        mm6                  ; mm3 = r0 * kernel 2

        movq        mm6,        [rbx + 48]           ; mm6 = kernel 3 taps
        movq        mm5,        [rsi + rax]          ; mm5 = r1 p0..p7
        punpcklbw   mm5,        mm0                  ; mm5 = r1 p0..p3
        pmullw      mm6,        mm5                  ; mm6 = r1 * kernel 3
        paddusw     mm3,        mm6                  ; mm3 += mm6

        ; thresholding
        movq        mm7,        mm1                  ; mm7 = r0 p0..p3
        psubusw     mm7,        mm5                  ; mm7 = sat(r0 - r1)
        psubusw     mm5,        mm1                  ; mm5 = sat(r1 - r0)
        paddusw     mm7,        mm5                  ; mm7 = abs(r0 - r1)
        pcmpgtw     mm7,        mm2                  ; mm7 = mask(abs > flimit)

        movq        mm6,        [rbx + 64 ]          ; mm6 = kernel 4 taps
        movq        mm5,        [rsi + 2*rax]        ; mm5 = r2 p0..p7
        punpcklbw   mm5,        mm0                  ; mm5 = r2 p0..p3
        pmullw      mm6,        mm5                  ; mm6 = r2 * kernel 4
        paddusw     mm3,        mm6                  ; mm3 += mm6

        ; thresholding
        movq        mm6,        mm1                  ; mm6 = r0 p0..p3
        psubusw     mm6,        mm5                  ; mm6 = sat(r0 - r2)
        psubusw     mm5,        mm1                  ; mm5 = sat(r2 - r0)
        paddusw     mm6,        mm5                  ; mm6 = abs(r0 - r2)
        pcmpgtw     mm6,        mm2
        por         mm7,        mm6                  ; accumulate thresholds


        neg         rax                              ; rax = -pitch, to reach the rows above
        movq        mm6,        [rbx ]               ; mm6 = kernel 0 taps
        movq        mm5,        [rsi+2*rax]          ; mm5 = r-2 p0..p7
        punpcklbw   mm5,        mm0                  ; mm5 = r-2 p0..p3
        pmullw      mm6,        mm5                  ; mm6 = r-2 * kernel 0
        paddusw     mm3,        mm6                  ; mm3 += mm6

        ; thresholding
        movq        mm6,        mm1                  ; mm6 = r0 p0..p3
        psubusw     mm6,        mm5                  ; mm6 = sat(r0 - r-2)
        psubusw     mm5,        mm1                  ; mm5 = sat(r-2 - r0)
        paddusw     mm6,        mm5                  ; mm6 = abs(r0 - r-2)
        pcmpgtw     mm6,        mm2
        por         mm7,        mm6                  ; accumulate thresholds

        movq        mm6,        [rbx + 16]           ; mm6 = kernel 1 taps
        movq        mm4,        [rsi+rax]            ; mm4 = r-1 p0..p7
        punpcklbw   mm4,        mm0                  ; mm4 = r-1 p0..p3
        pmullw      mm6,        mm4                  ; mm6 = r-1 * kernel 1
        paddusw     mm3,        mm6                  ; mm3 += mm6

        ; thresholding
        movq        mm6,        mm1                  ; mm6 = r0 p0..p3
        psubusw     mm6,        mm4                  ; mm6 = sat(r0 - r-1)
        psubusw     mm4,        mm1                  ; mm4 = sat(r-1 - r0)
        paddusw     mm6,        mm4                  ; mm6 = abs(r0 - r-1)
        pcmpgtw     mm6,        mm2
        por         mm7,        mm6                  ; accumulate thresholds


        paddusw     mm3,        RD                   ; mm3 += round value
        psraw       mm3,        VP8_FILTER_SHIFT     ; mm3 /= 128

        pand        mm1,        mm7                  ; mm1 = source where a threshold was exceeded
        pandn       mm7,        mm3                  ; mm7 = blurred result everywhere else
        paddusw     mm1,        mm7                  ; combination

        packuswb    mm1,        mm0                  ; pack to bytes

        movd        [rdi],      mm1                  ; store four output pixels
        neg         rax                              ; pitch is positive again


        add         rsi,        4
        add         rdi,        4
        add         rdx,        4

        cmp         edx,        dword ptr arg(5)     ; cols
        jl          nextcol
        ; done with all the cols, start the across filtering in place
        sub         rsi,        rdx                  ; rewind both pointers to
        sub         rdi,        rdx                  ; the start of the row


        ; During the across pass eax carries the previously computed four
        ; output bytes; they are stored one iteration late so that the
        ; filter still reads unfiltered left-hand neighbours.  The pitch
        ; that lived in rax is reloaded from arg(2) after the loop rather
        ; than being pushed, which keeps rsp (and therefore RD) stable.
        xor         rdx,        rdx
        mov         eax,        DWORD PTR [rdi-4]    ; seed with the four bytes left of the row

acrossnextcol:
        pxor        mm7,        mm7                  ; mm7 = 00000000
        movq        mm6,        [rbx + 32 ]          ; mm6 = kernel 2 taps
        movq        mm4,        [rdi+rdx]            ; mm4 = p0..p7
        movq        mm3,        mm4                  ; mm3 = p0..p7
        punpcklbw   mm3,        mm0                  ; mm3 = p0..p3
        movq        mm1,        mm3                  ; mm1 = p0..p3
        pmullw      mm3,        mm6                  ; mm3 = p0..p3 * kernel 2

        movq        mm6,        [rbx + 48]           ; mm6 = kernel 3 taps
        psrlq       mm4,        8                    ; mm4 = p1..p7
        movq        mm5,        mm4                  ; mm5 = p1..p7
        punpcklbw   mm5,        mm0                  ; mm5 = p1..p4
        pmullw      mm6,        mm5                  ; mm6 = p1..p4 * kernel 3
        paddusw     mm3,        mm6                  ; mm3 += mm6

        ; thresholding
        movq        mm7,        mm1                  ; mm7 = p0..p3
        psubusw     mm7,        mm5                  ; mm7 = sat(p0..p3 - p1..p4)
        psubusw     mm5,        mm1                  ; mm5 = sat(p1..p4 - p0..p3)
        paddusw     mm7,        mm5                  ; mm7 = abs(p0..p3 - p1..p4)
        pcmpgtw     mm7,        mm2

        movq        mm6,        [rbx + 64 ]          ; mm6 = kernel 4 taps
        psrlq       mm4,        8                    ; mm4 = p2..p7
        movq        mm5,        mm4                  ; mm5 = p2..p7
        punpcklbw   mm5,        mm0                  ; mm5 = p2..p5
        pmullw      mm6,        mm5                  ; mm6 = p2..p5 * kernel 4
        paddusw     mm3,        mm6                  ; mm3 += mm6

        ; thresholding
        movq        mm6,        mm1                  ; mm6 = p0..p3
        psubusw     mm6,        mm5                  ; mm6 = sat(p0..p3 - p2..p5)
        psubusw     mm5,        mm1                  ; mm5 = sat(p2..p5 - p0..p3)
        paddusw     mm6,        mm5                  ; mm6 = abs(p0..p3 - p2..p5)
        pcmpgtw     mm6,        mm2
        por         mm7,        mm6                  ; accumulate thresholds


        movq        mm6,        [rbx ]               ; mm6 = kernel 0 taps
        movq        mm4,        [rdi+rdx-2]          ; mm4 = p-2..p5
        movq        mm5,        mm4                  ; mm5 = p-2..p5
        punpcklbw   mm5,        mm0                  ; mm5 = p-2..p1
        pmullw      mm6,        mm5                  ; mm6 = p-2..p1 * kernel 0
        paddusw     mm3,        mm6                  ; mm3 += mm6

        ; thresholding
        movq        mm6,        mm1                  ; mm6 = p0..p3
        psubusw     mm6,        mm5                  ; mm6 = sat(p0..p3 - p-2..p1)
        psubusw     mm5,        mm1                  ; mm5 = sat(p-2..p1 - p0..p3)
        paddusw     mm6,        mm5                  ; mm6 = abs(p0..p3 - p-2..p1)
        pcmpgtw     mm6,        mm2
        por         mm7,        mm6                  ; accumulate thresholds

        movq        mm6,        [rbx + 16]           ; mm6 = kernel 1 taps
        psrlq       mm4,        8                    ; mm4 = p-1..p5
        punpcklbw   mm4,        mm0                  ; mm4 = p-1..p2
        pmullw      mm6,        mm4                  ; mm6 = p-1..p2 * kernel 1
        paddusw     mm3,        mm6                  ; mm3 += mm6

        ; thresholding
        movq        mm6,        mm1                  ; mm6 = p0..p3
        psubusw     mm6,        mm4                  ; mm6 = sat(p0..p3 - p-1..p2)
        psubusw     mm4,        mm1                  ; mm4 = sat(p-1..p2 - p0..p3)
        paddusw     mm6,        mm4                  ; mm6 = abs(p0..p3 - p-1..p2)
        pcmpgtw     mm6,        mm2
        por         mm7,        mm6                  ; accumulate thresholds

        paddusw     mm3,        RD                   ; mm3 += round value
        psraw       mm3,        VP8_FILTER_SHIFT     ; mm3 /= 128

        pand        mm1,        mm7                  ; mm1 = source where a threshold was exceeded
        pandn       mm7,        mm3                  ; mm7 = blurred result everywhere else
        paddusw     mm1,        mm7                  ; combination

        packuswb    mm1,        mm0                  ; pack to bytes
        mov         DWORD PTR [rdi+rdx-4],  eax      ; store previous four bytes
        movd        eax,        mm1                  ; hold this result until the next iteration

        add         rdx,        4
        cmp         edx,        dword ptr arg(5)     ; cols
        jl          acrossnextcol

        mov         DWORD PTR [rdi+rdx-4],  eax      ; flush the last four bytes of the row

        ; done with this row
        movsxd      rax,        dword ptr arg(2)     ; src_pixels_per_line (rax was reused above)
        add         rsi,        rax                  ; next source line
        movsxd      rax,        dword ptr arg(3)     ; dst_pixels_per_line
        add         rdi,        rax                  ; next destination line
        movsxd      rax,        dword ptr arg(2)     ; src_pixels_per_line, for the next down pass

        dec         rcx                              ; decrement count
        jnz         nextrow                          ; next row

%if ABI_IS_32BIT=1 && CONFIG_PIC=1
    add         rsp, 8                               ; release the stack copy of rd
%endif
    pop         rbx

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef RD


;-----------------------------------------------------------------------
;void vp8_mbpost_proc_down_mmx(unsigned char *dst,
;                             int pitch, int rows, int cols,int flimit)
;
; Works on columns of four pixels.  For each column group a running sum
; (mm5, four words) and running sum of squares (mm6/mm7, four dwords) is
; kept over a 15-row window centred on the current row.  When
; 15*sumsq - sum*sum is below flimit the pixel is replaced by
; (pixel + sum + vp8_rv[row & 127]) >> 4, otherwise it is left unchanged.
; Results pass through a 16-entry ring buffer on the stack and are written
; back eight rows late, so the window always reads unfiltered pixels.
;
; The arg(0), arg(2) and arg(3) slots are modified in place (s, rows, cols).
;
; NOTE(review): during the first eight iterations of loop_row the ring
; buffer entries that are written back (to the rows s-8*pitch .. s-pitch)
; have not been initialised by this function - confirm the caller treats
; those rows as scratch/border.
; NOTE(review): no emms is issued here; presumably the caller clears the
; MMX state - confirm.
;-----------------------------------------------------------------------
extern sym(vp8_rv)
global sym(vp8_mbpost_proc_down_mmx)
sym(vp8_mbpost_proc_down_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 136

    ; ring buffer d[] at [rsp]: 16 slots of four bytes are used ([rsp+rcx*4])
    ; create flimit2 (flimit in both dwords) at [rsp+128]
    mov         eax, dword ptr arg(4) ;flimit
    mov         [rsp+128], eax
    mov         [rsp+128+4], eax
%define flimit2 [rsp+128]

%if ABI_IS_32BIT=0
    lea         r8,       [GLOBAL(sym(vp8_rv))]      ; r8 = &vp8_rv, live for the whole function
%endif

    ;rows +=8;
    add         dword ptr arg(2), 8                  ; eight extra rows drain the delayed write-back

    ;for(c=0; c<cols; c+=4)
loop_col:
            mov         rsi,        arg(0)  ;s
            pxor        mm0,        mm0     ; mm0 = 0, used for unpacking

            movsxd      rax,        dword ptr arg(1) ;pitch
            neg         rax                                     ; rax = -pitch

            lea         rsi,        [rsi + rax*8]               ; rsi = s - 8*pitch
            neg         rax                                     ; rax = pitch


            pxor        mm5,        mm5     ; mm5 = running sum (words)
            pxor        mm6,        mm6     ; mm6 = running sum of squares, pixels 0..1

            pxor        mm7,        mm7     ; mm7 = running sum of squares, pixels 2..3
            mov         rdi,        rsi     ; rdi walks the leading edge of the window

            mov         rcx,        15      ; prime the window with 15 rows

loop_initvar:
            movd        mm1,        DWORD PTR [rdi]
            punpcklbw   mm1,        mm0     ; four pixels as words

            paddw       mm5,        mm1     ; sum += pixels
            pmullw      mm1,        mm1     ; pixels squared

            movq        mm2,        mm1
            punpcklwd   mm1,        mm0     ; squares 0..1 as dwords

            punpckhwd   mm2,        mm0     ; squares 2..3 as dwords
            paddd       mm6,        mm1

            paddd       mm7,        mm2
            lea         rdi,        [rdi+rax]

            dec         rcx
            jne         loop_initvar
            ; rdi now points at s + 7*pitch; sums cover s-8*pitch .. s+6*pitch
            xor         rdx,        rdx     ; rdx = row counter
loop_row:
            movd        mm1,        DWORD PTR [rsi]     ; [s-pitch*8]  row leaving the window
            movd        mm2,        DWORD PTR [rdi]     ; [s+pitch*7]  row entering the window

            punpcklbw   mm1,        mm0
            punpcklbw   mm2,        mm0

            paddw       mm5,        mm2     ; sum += entering row
            psubw       mm5,        mm1     ; sum -= leaving row

            pmullw      mm2,        mm2
            movq        mm4,        mm2

            punpcklwd   mm2,        mm0
            punpckhwd   mm4,        mm0

            paddd       mm6,        mm2     ; sumsq += entering row squared
            paddd       mm7,        mm4

            pmullw      mm1,        mm1
            movq        mm2,        mm1

            punpcklwd   mm1,        mm0
            psubd       mm6,        mm1     ; sumsq -= leaving row squared

            punpckhwd   mm2,        mm0
            psubd       mm7,        mm2


            movq        mm3,        mm6
            pslld       mm3,        4

            psubd       mm3,        mm6     ; mm3 = 15 * sumsq (pixels 0..1)
            movq        mm1,        mm5

            movq        mm4,        mm5
            pmullw      mm1,        mm1     ; low  16 bits of sum*sum

            pmulhw      mm4,        mm4     ; high 16 bits of sum*sum
            movq        mm2,        mm1

            punpcklwd   mm1,        mm4     ; sum*sum as dwords, pixels 0..1
            punpckhwd   mm2,        mm4     ; sum*sum as dwords, pixels 2..3

            movq        mm4,        mm7
            pslld       mm4,        4

            psubd       mm4,        mm7     ; mm4 = 15 * sumsq (pixels 2..3)

            psubd       mm3,        mm1     ; 15*sumsq - sum*sum
            psubd       mm4,        mm2

            psubd       mm3,        flimit2
            psubd       mm4,        flimit2

            psrad       mm3,        31      ; all ones where the value was below flimit
            psrad       mm4,        31

            packssdw    mm3,        mm4     ; mask as four words
            packsswb    mm3,        mm0     ; mask as four bytes

            movd        mm1,        DWORD PTR [rsi+rax*8]   ; current row, s

            movq        mm2,        mm1     ; mm2 = unfiltered pixels (bytes)
            punpcklbw   mm1,        mm0

            paddw       mm1,        mm5     ; pixel + sum
            mov         rcx,        rdx

            and         rcx,        127     ; row & 127 indexes vp8_rv
%if ABI_IS_32BIT=1 && CONFIG_PIC=1
            push        rax                 ; rax (pitch) is borrowed for the table address
            lea         rax,        [GLOBAL(sym(vp8_rv))]
            movq        mm4,        [rax + rcx*2] ;vp8_rv[rcx*2]
            pop         rax
%elif ABI_IS_32BIT=0
            movq        mm4,        [r8 + rcx*2] ;vp8_rv[rcx*2]
%else
            movq        mm4,        [sym(vp8_rv) + rcx*2]
%endif
            paddw       mm1,        mm4     ; + rounding values from vp8_rv
            ;paddw     xmm1,       eight8s
            psraw       mm1,        4

            packuswb    mm1,        mm0
            pand        mm1,        mm3     ; filtered pixels where the mask is set

            pandn       mm3,        mm2     ; original pixels elsewhere
            por         mm1,        mm3

            and         rcx,        15
            movd        DWORD PTR   [rsp+rcx*4], mm1 ;d[rcx*4]   d[row & 15] = result

            mov         rcx,        rdx
            sub         rcx,        8

            and         rcx,        15
            movd        mm1,        DWORD PTR [rsp+rcx*4] ;d[rcx*4]   d[(row - 8) & 15]

            movd        [rsi],      mm1     ; write back the result from eight rows ago
            lea         rsi,        [rsi+rax]

            lea         rdi,        [rdi+rax]
            add         rdx,        1

            cmp         edx,        dword arg(2) ;rows
            jl          loop_row


            ; s += 4.  The add is done at full register width so that the
            ; whole pointer slot is updated on 64-bit targets; rax is free
            ; here because loop_col reloads the pitch into it.
            mov         rax,        4
            add         arg(0),     rax     ; s += 4
            sub         dword arg(3), 4     ; cols -= 4
            cmp         dword arg(3), 0
            jg          loop_col

    add         rsp, 136
    pop         rsp                         ; undo ALIGN_STACK

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit2


;-----------------------------------------------------------------------
;void vp8_plane_add_noise_mmx (unsigned char *Start, unsigned char *noise,
;                            unsigned char blackclamp[16],
;                            unsigned char whiteclamp[16],
;                            unsigned char bothclamp[16],
;                            unsigned int Width, unsigned int Height, int Pitch)
;
; For each of Height rows: picks an offset (rand() & 0xff) into the noise
; buffer, then, eight bytes at a time, clamps the source pixels with the
; three clamp vectors and adds the noise bytes with wrap-around (paddb).
; The inner loop always runs at least once and advances in steps of eight,
; so it processes Width rounded up to a multiple of eight bytes.
;
; The arg(0) and arg(6) slots are modified in place (Start, Height).
; Only blackclamp (arg(2)) is loaded; whiteclamp and bothclamp are read at
; fixed offsets +16 and +32 from it.
;-----------------------------------------------------------------------
extern sym(rand)
global sym(vp8_plane_add_noise_mmx)
sym(vp8_plane_add_noise_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

addnoise_loop:
    call sym(rand) WRT_PLT
    mov     rcx, arg(1) ;noise
    and     rax, 0xff                   ; random starting offset, 0..255
    add     rcx, rax

    ; we rely on the fact that the clamping vectors are stored contiguously
    ; in black/white/both order. Note that we have to reload this here because
    ; rdx could be trashed by rand()
    mov     rdx, arg(2) ; blackclamp


            mov     rdi, rcx            ; rdi = noise + random offset
            movsxd  rcx, dword arg(5) ;[Width]
            mov     rsi, arg(0) ;Pos
            xor         rax,rax         ; rax = byte offset within the row

addnoise_nextset:
            movq        mm1,[rsi+rax]         ; get the source

            psubusb     mm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
            paddusb     mm1, [rdx+32] ;bothclamp
            psubusb     mm1, [rdx+16] ;whiteclamp

            movq        mm2,[rdi+rax]         ; get the noise for this line
            paddb       mm1,mm2               ; add it in
            movq        [rsi+rax],mm1         ; store the result

            add         rax,8                 ; move to the next eight bytes

            cmp         rax, rcx
            jl          addnoise_nextset

    movsxd  rax, dword arg(7) ; Pitch
    add     arg(0), rax ; Start += Pitch
    sub     dword arg(6), 1   ; Height -= 1
    jg      addnoise_loop

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


SECTION_RODATA
align 16
; 5-tap blur kernel, one 16-byte slot per tap (only the first four words of
; each slot are loaded by movq): taps 0 and 1 = 16, tap 2 = 64,
; taps 3 and 4 = 16.  The weights sum to VP8_FILTER_WEIGHT.
Blur:
    times 16 dw 16
    times  8 dw 64
    times 16 dw 16
    times  8 dw  0

; rounding constant added before the shift by VP8_FILTER_SHIFT
rd:
    times 4 dw 0x40