;
;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------------
; Two-tap (bilinear) SSE2 filter kernels.
;
; Every exported routine fetches six arguments through arg(n):
;   arg(0)  src_ptr          source pixels (bytes)
;   arg(1)  pixels_per_line  source stride, fetched with movsxd from a DWORD
;   arg(2)  output_ptr       destination pixels (bytes)
;   arg(3)  out_pitch        destination stride, fetched the same way
;   arg(4)  output_height    number of rows, fetched the same way
;   arg(5)  filter           pointer to eight 16-bit taps; loaded with movdqa,
;                            so it must be 16-byte aligned. Only words 3 and 4
;                            (called k3 and k4 below) are used.
;
; Per output pixel, in 16-bit lanes:
;   out = pack_u8((a * k3 + b * k4 + 64) >> 7)
; where pmullw keeps the low 16 bits of each product, paddsw saturates the
; sums, psraw is an arithmetic shift and packuswb saturates to 0..255.
;   _v2_  : b is the pixel one source row below a  ([rsi + rax])
;   _h2_  : b is the pixel one byte to the right of a ([rsi + 1])
;   _avg_ : the result is additionally pavgb-averaged with the bytes already
;           stored at the destination before being written back.
;
; Preconditions visible from the code:
;   * output_height must be >= 1: the row loop is bottom-tested
;     (dec rcx / jnz), so a count of 0 would not terminate immediately.
;
; Register roles after GET_PARAM / GET_PARAM_4:
;   rsi = current source row      rdi = current destination row
;   rax = source stride           rdx = destination stride
;   rcx = rows remaining
;
; rsi and rdi are pushed/popped by every routine. SHADOW_ARGS_TO_STACK,
; UNSHADOW_ARGS, SAVE_XMM and RESTORE_XMM are provided by the include; the
; routines that touch xmm6/xmm7 bracket their body with SAVE_XMM 7 /
; RESTORE_XMM (presumably to preserve those registers where the ABI requires
; it -- see x86_abi_support.asm).
;-----------------------------------------------------------------------------

; Rounding term: two packed 16-bit words, each 64 = 1 << (BILIN_SHIFT - 1).
; It is broadcast to all eight word lanes with pshufd.
%define BILIN_ROUND_PAIR    0x00400040
; Right shift applied to the rounded sum.
%define BILIN_SHIFT         7

;-----------------------------------------------------------------------------
; GET_PARAM_4 -- argument/constant setup for the 4-pixel-wide kernels.
; Out: rsi, rdi, rax, rdx, rcx as listed in the file header
;      xmm4 = k3 k3 k3 k3 | k4 k4 k4 k4   (low qword | high qword)
;      xmm3 = rounding constant in every word
;      xmm2 = 0
;-----------------------------------------------------------------------------
%macro GET_PARAM_4 0
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, BILIN_ROUND_PAIR

    movdqa      xmm3, [rdx]                 ;load filters (aligned)
    pshuflw     xmm4, xmm3, 11111111b       ;low 4 words = k3
    psrldq      xmm3, 8                     ;bring words 4..7 down to 0..3
    pshuflw     xmm3, xmm3, 0b              ;low 4 words = k4
    punpcklqdq  xmm4, xmm3                  ;k3 x4 | k4 x4

    movq        xmm3, rcx                   ;rounding (xmm3 reused)
    pshufd      xmm3, xmm3, 0               ;broadcast to all lanes

    pxor        xmm2, xmm2                  ;zero, for byte->word unpack

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
%endm

;-----------------------------------------------------------------------------
; APPLY_FILTER_4 avg -- filter and store one row of 4 pixels.
; In:  xmm0 low dword = the four "a" pixels
;      xmm1 low dword = the four "b" pixels
;      (only those low dwords reach the result)
; %1:  non-zero -> average with the 4 bytes already at [rdi]
; Out: 4 bytes written to [rdi]; rsi/rdi advanced by one row; rcx decremented.
; The trailing dec leaves ZF for the caller's jnz -- keep it last.
; Clobbers xmm0, xmm1.
;-----------------------------------------------------------------------------
%macro APPLY_FILTER_4 1

    punpckldq   xmm0, xmm1                  ;a0..a3 b0..b3 in the low qword
    punpcklbw   xmm0, xmm2                  ;unpack to words: a x4 | b x4
    pmullw      xmm0, xmm4                  ;a * k3 | b * k4

    movdqa      xmm1, xmm0
    psrldq      xmm1, 8                     ;b * k4 down to the low qword
    paddsw      xmm0, xmm1                  ;a * k3 + b * k4

    paddsw      xmm0, xmm3                  ;rounding
    psraw       xmm0, BILIN_SHIFT           ;shift
    packuswb    xmm0, xmm0                  ;pack to byte

%if %1
    movd        xmm1, [rdi]                 ;existing destination pixels
    pavgb       xmm0, xmm1
%endif

    movd        [rdi], xmm0                 ;store the result
    lea         rsi, [rsi + rax]            ;next source row
    lea         rdi, [rdi + rdx]            ;next destination row
    dec         rcx                         ;sets ZF for the caller's jnz
%endm

;-----------------------------------------------------------------------------
; GET_PARAM -- argument/constant setup for the 8- and 16-pixel-wide kernels.
; Out: rsi, rdi, rax, rdx, rcx as listed in the file header
;      xmm6 = k3 in every word
;      xmm7 = k4 in every word
;      xmm4 = rounding constant in every word
;      xmm5 = 0
;-----------------------------------------------------------------------------
%macro GET_PARAM 0
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, BILIN_ROUND_PAIR

    movdqa      xmm7, [rdx]                 ;load filters (aligned)

    pshuflw     xmm6, xmm7, 11111111b       ;low 4 words = k3
    pshufhw     xmm7, xmm7, 0b              ;high 4 words = k4
    punpcklwd   xmm6, xmm6                  ;k3 x8
    punpckhwd   xmm7, xmm7                  ;k4 x8

    movq        xmm4, rcx                   ;rounding
    pshufd      xmm4, xmm4, 0               ;broadcast to all lanes

    pxor        xmm5, xmm5                  ;zero, for byte->word unpack

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
%endm

;-----------------------------------------------------------------------------
; APPLY_FILTER_8 avg -- filter and store one row of 8 pixels.
; In:  xmm0 low qword = the eight "a" pixels
;      xmm1 low qword = the eight "b" pixels
;      (only those low qwords reach the result)
; %1:  non-zero -> average with the 8 bytes already at [rdi]
; Out: 8 bytes written to [rdi]; rsi/rdi advanced by one row; rcx decremented.
; The trailing dec leaves ZF for the caller's jnz -- keep it last.
; Clobbers xmm0, xmm1.
;-----------------------------------------------------------------------------
%macro APPLY_FILTER_8 1
    punpcklbw   xmm0, xmm5                  ;a -> words
    punpcklbw   xmm1, xmm5                  ;b -> words

    pmullw      xmm0, xmm6                  ;a * k3
    pmullw      xmm1, xmm7                  ;b * k4
    paddsw      xmm0, xmm1
    paddsw      xmm0, xmm4                  ;rounding
    psraw       xmm0, BILIN_SHIFT           ;shift
    packuswb    xmm0, xmm0                  ;pack back to byte
%if %1
    movq        xmm1, [rdi]                 ;existing destination pixels
    pavgb       xmm0, xmm1
%endif
    movq        [rdi], xmm0                 ;store the result

    lea         rsi, [rsi + rax]            ;next source row
    lea         rdi, [rdi + rdx]            ;next destination row
    dec         rcx                         ;sets ZF for the caller's jnz
%endm

;-----------------------------------------------------------------------------
; APPLY_FILTER_16 avg -- filter and store one row of 16 pixels.
; In:  xmm0 = xmm2 = the sixteen "a" pixels
;      xmm1 = xmm3 = the sixteen "b" pixels
;      (xmm0/xmm1 supply the low 8 pixels, xmm2/xmm3 the high 8)
; %1:  non-zero -> average with the 16 bytes already at [rdi]
; Out: 16 bytes written to [rdi] with an unaligned store; rsi/rdi advanced by
;      one row; rcx decremented.
; The trailing dec leaves ZF for the caller's jnz -- keep it last.
; Clobbers xmm0-xmm3.
;-----------------------------------------------------------------------------
%macro APPLY_FILTER_16 1
    punpcklbw   xmm0, xmm5                  ;a[0..7]  -> words
    punpcklbw   xmm1, xmm5                  ;b[0..7]  -> words
    punpckhbw   xmm2, xmm5                  ;a[8..15] -> words
    punpckhbw   xmm3, xmm5                  ;b[8..15] -> words

    pmullw      xmm0, xmm6                  ;a * k3
    pmullw      xmm1, xmm7                  ;b * k4
    pmullw      xmm2, xmm6
    pmullw      xmm3, xmm7

    paddsw      xmm0, xmm1
    paddsw      xmm2, xmm3

    paddsw      xmm0, xmm4                  ;rounding
    paddsw      xmm2, xmm4
    psraw       xmm0, BILIN_SHIFT           ;shift
    psraw       xmm2, BILIN_SHIFT
    packuswb    xmm0, xmm2                  ;pack back to byte
%if %1
    movdqu      xmm1, [rdi]                 ;existing destination pixels
    pavgb       xmm0, xmm1
%endif
    movdqu      [rdi], xmm0                 ;store the result

    lea         rsi, [rsi + rax]            ;next source row
    lea         rdi, [rdi + rdx]            ;next destination row
    dec         rcx                         ;sets ZF for the caller's jnz
%endm

; Select the code section explicitly instead of relying on whichever section
; the include leaves active.
SECTION .text

;-----------------------------------------------------------------------------
; Vertical kernels: a = [rsi], b = [rsi + rax] (the next source row).
;-----------------------------------------------------------------------------

; Vertical, 4 pixels per row, plain store.
global sym(vpx_filter_block1d4_v2_sse2) PRIVATE
sym(vpx_filter_block1d4_v2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM_4
.loop:
    movd        xmm0, [rsi]                 ;row y
    movd        xmm1, [rsi + rax]           ;row y + 1

    APPLY_FILTER_4 0
    jnz         .loop                       ;ZF from dec rcx in the macro

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

; Vertical, 8 pixels per row, plain store.
global sym(vpx_filter_block1d8_v2_sse2) PRIVATE
sym(vpx_filter_block1d8_v2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.loop:
    movq        xmm0, [rsi]                 ;row y
    movq        xmm1, [rsi + rax]           ;row y + 1

    APPLY_FILTER_8 0
    jnz         .loop                       ;ZF from dec rcx in the macro

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; Vertical, 16 pixels per row, plain store.
global sym(vpx_filter_block1d16_v2_sse2) PRIVATE
sym(vpx_filter_block1d16_v2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.loop:
    movdqu      xmm0, [rsi]                 ;row y
    movdqu      xmm1, [rsi + rax]           ;row y + 1
    movdqa      xmm2, xmm0                  ;copies for the high 8 pixels
    movdqa      xmm3, xmm1

    APPLY_FILTER_16 0
    jnz         .loop                       ;ZF from dec rcx in the macro

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; Vertical, 4 pixels per row, averaged with the destination.
global sym(vpx_filter_block1d4_v2_avg_sse2) PRIVATE
sym(vpx_filter_block1d4_v2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM_4
.loop:
    movd        xmm0, [rsi]                 ;row y
    movd        xmm1, [rsi + rax]           ;row y + 1

    APPLY_FILTER_4 1
    jnz         .loop                       ;ZF from dec rcx in the macro

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

; Vertical, 8 pixels per row, averaged with the destination.
global sym(vpx_filter_block1d8_v2_avg_sse2) PRIVATE
sym(vpx_filter_block1d8_v2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.loop:
    movq        xmm0, [rsi]                 ;row y
    movq        xmm1, [rsi + rax]           ;row y + 1

    APPLY_FILTER_8 1
    jnz         .loop                       ;ZF from dec rcx in the macro

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; Vertical, 16 pixels per row, averaged with the destination.
global sym(vpx_filter_block1d16_v2_avg_sse2) PRIVATE
sym(vpx_filter_block1d16_v2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.loop:
    movdqu      xmm0, [rsi]                 ;row y
    movdqu      xmm1, [rsi + rax]           ;row y + 1
    movdqa      xmm2, xmm0                  ;copies for the high 8 pixels
    movdqa      xmm3, xmm1

    APPLY_FILTER_16 1
    jnz         .loop                       ;ZF from dec rcx in the macro

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
; Horizontal kernels: a = [rsi], b = [rsi + 1] (the next pixel in the row).
; Each row is fetched with two loads of exactly the width the filter macro
; consumes, so a W-pixel kernel touches src[0 .. W] and nothing beyond it.
;-----------------------------------------------------------------------------

; Horizontal, 4 pixels per row, plain store.
global sym(vpx_filter_block1d4_h2_sse2) PRIVATE
sym(vpx_filter_block1d4_h2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM_4
.loop:
    movd        xmm0, [rsi]                 ;src[0..3]
    movd        xmm1, [rsi + 1]             ;src[1..4]

    APPLY_FILTER_4 0
    jnz         .loop                       ;ZF from dec rcx in the macro

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

; Horizontal, 8 pixels per row, plain store.
global sym(vpx_filter_block1d8_h2_sse2) PRIVATE
sym(vpx_filter_block1d8_h2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.loop:
    movq        xmm0, [rsi]                 ;src[0..7]
    movq        xmm1, [rsi + 1]             ;src[1..8]

    APPLY_FILTER_8 0
    jnz         .loop                       ;ZF from dec rcx in the macro

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; Horizontal, 16 pixels per row, plain store.
global sym(vpx_filter_block1d16_h2_sse2) PRIVATE
sym(vpx_filter_block1d16_h2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.loop:
    movdqu      xmm0, [rsi]                 ;src[0..15]
    movdqu      xmm1, [rsi + 1]             ;src[1..16]
    movdqa      xmm2, xmm0                  ;copies for the high 8 pixels
    movdqa      xmm3, xmm1

    APPLY_FILTER_16 0
    jnz         .loop                       ;ZF from dec rcx in the macro

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; Horizontal, 4 pixels per row, averaged with the destination.
global sym(vpx_filter_block1d4_h2_avg_sse2) PRIVATE
sym(vpx_filter_block1d4_h2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM_4
.loop:
    movd        xmm0, [rsi]                 ;src[0..3]
    movd        xmm1, [rsi + 1]             ;src[1..4]

    APPLY_FILTER_4 1
    jnz         .loop                       ;ZF from dec rcx in the macro

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

; Horizontal, 8 pixels per row, averaged with the destination.
global sym(vpx_filter_block1d8_h2_avg_sse2) PRIVATE
sym(vpx_filter_block1d8_h2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.loop:
    movq        xmm0, [rsi]                 ;src[0..7]
    movq        xmm1, [rsi + 1]             ;src[1..8]

    APPLY_FILTER_8 1
    jnz         .loop                       ;ZF from dec rcx in the macro

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; Horizontal, 16 pixels per row, averaged with the destination.
global sym(vpx_filter_block1d16_h2_avg_sse2) PRIVATE
sym(vpx_filter_block1d16_h2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.loop:
    movdqu      xmm0, [rsi]                 ;src[0..15]
    movdqu      xmm1, [rsi + 1]             ;src[1..16]
    movdqa      xmm2, xmm0                  ;copies for the high 8 pixels
    movdqa      xmm3, xmm1

    APPLY_FILTER_16 1
    jnz         .loop                       ;ZF from dec rcx in the macro

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret