;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; STACK_FRAME_CREATE_X3
;
; Function prologue shared by every routine in this file.  It binds the
; symbolic operand names (src_ptr, src_stride, ref_ptr, ref_stride,
; result_ptr, ...) to the registers or stack slots that hold them under
; the ABI being assembled for:
;
;   32-bit : builds an rbp frame, saves rsi/rdi/rbx, and loads the
;            pointer and stride arguments from the stack.
;   Win64  : arguments stay in rcx/rdx/r8/r9; the fifth argument is read
;            from the caller's stack.
;   SysV64 : arguments stay in rdi/rsi/rdx/rcx/r8.
;
; The strides are C `int`s.  In the 64-bit conventions the upper half of
; a register carrying an `int` is not defined, and the strides are used
; in 64-bit address arithmetic, so they are sign-extended in place on
; every path (the 32-bit path already did this while loading them).
;
; Must be paired with STACK_FRAME_DESTROY_X3.
;-----------------------------------------------------------------------
%macro STACK_FRAME_CREATE_X3 0
%if ABI_IS_32BIT
  %define     src_ptr       rsi
  %define     src_stride    rax
  %define     ref_ptr       rdi
  %define     ref_stride    rdx
  %define     end_ptr       rcx
  %define     ret_var       rbx
  %define     result_ptr    arg(4)
  %define     max_err       arg(4)
  %define     height        dword ptr arg(4)
    push        rbp
    mov         rbp,        rsp
    push        rsi
    push        rdi
    push        rbx

    mov         rsi,        arg(0)              ; src_ptr
    mov         rdi,        arg(2)              ; ref_ptr

    movsxd      rax,        dword ptr arg(1)    ; src_stride
    movsxd      rdx,        dword ptr arg(3)    ; ref_stride
%else
  %if LIBVPX_YASM_WIN64
    ; NOTE(review): SAVE_XMM comes from x86_abi_support.asm; presumably
    ; it preserves xmm6-xmm7, which are used as accumulators below and
    ; are callee-saved on Win64 -- confirm against the include.
    SAVE_XMM 7, u
    %define     src_ptr     rcx
    %define     src_stride  rdx
    %define     ref_ptr     r8
    %define     ref_stride  r9
    %define     end_ptr     r10
    %define     ret_var     r11
    %define     result_ptr  [rsp+xmm_stack_space+8+4*8]
    %define     max_err     [rsp+xmm_stack_space+8+4*8]
    %define     height      dword ptr [rsp+xmm_stack_space+8+4*8]

    ; int arguments: upper 32 bits of the register are unspecified
    movsxd      rdx,        edx                 ; src_stride
    movsxd      r9,         r9d                 ; ref_stride
  %else
    %define     src_ptr     rdi
    %define     src_stride  rsi
    %define     ref_ptr     rdx
    %define     ref_stride  rcx
    %define     end_ptr     r9
    %define     ret_var     r10
    %define     result_ptr  r8
    %define     max_err     r8
    %define     height      r8

    ; int arguments: upper 32 bits of the register are unspecified
    movsxd      rsi,        esi                 ; src_stride
    movsxd      rcx,        ecx                 ; ref_stride
  %endif
%endif

%endmacro

;-----------------------------------------------------------------------
; STACK_FRAME_DESTROY_X3
;
; Function epilogue matching STACK_FRAME_CREATE_X3.  Clears the symbolic
; operand names, restores whatever the prologue saved for the current
; ABI, and returns to the caller.
;-----------------------------------------------------------------------
%macro STACK_FRAME_DESTROY_X3 0
  %define     src_ptr
  %define     src_stride
  %define     ref_ptr
  %define     ref_stride
  %define     end_ptr
  %define     ret_var
  %define     result_ptr
  %define     max_err
  %define     height

%if ABI_IS_32BIT
    pop         rbx
    pop         rdi
    pop         rsi
    pop         rbp
%else
  %if LIBVPX_YASM_WIN64
    RESTORE_XMM
  %endif
%endif
    ret
%endmacro

;-----------------------------------------------------------------------
; PROCESS_16X2X3 stage, src, ref, src_stride, ref_stride
;
; Accumulates the SAD of two 16-byte source rows against three reference
; candidates located at ref+0, ref+1 and ref+2.
;
;   %1 stage : 0 = first call  (xmm5/xmm6/xmm7 are initialised)
;              1 = middle call (accumulate, then advance pointers)
;              2 = last call   (accumulate, pointers left untouched)
;   %2 src   : source row pointer register; read with movdqa, so it
;              must be 16-byte aligned.  Advanced by 2 rows for
;              stage 0 and 1.
;   %3 ref   : reference row pointer register; read with lddqu
;              (unaligned).  Advanced by 2 rows for stage 0 and 1.
;   %4, %5   : source / reference stride registers.
;
; Out:   xmm5, xmm6, xmm7 = running SADs for ref+0, ref+1, ref+2
;        (one partial sum per 64-bit half, as produced by psadbw).
; Clobb: xmm0-xmm3
;-----------------------------------------------------------------------
%macro PROCESS_16X2X3 5
%if %1==0
        ; first row: load straight into the accumulators
        movdqa          xmm0,       XMMWORD PTR [%2]
        lddqu           xmm5,       XMMWORD PTR [%3]
        lddqu           xmm6,       XMMWORD PTR [%3+1]
        lddqu           xmm7,       XMMWORD PTR [%3+2]

        psadbw          xmm5,       xmm0
        psadbw          xmm6,       xmm0
        psadbw          xmm7,       xmm0
%else
        ; first row: compute in scratch registers and accumulate
        movdqa          xmm0,       XMMWORD PTR [%2]
        lddqu           xmm1,       XMMWORD PTR [%3]
        lddqu           xmm2,       XMMWORD PTR [%3+1]
        lddqu           xmm3,       XMMWORD PTR [%3+2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endif
        ; second row
        movdqa          xmm0,       XMMWORD PTR [%2+%4]
        lddqu           xmm1,       XMMWORD PTR [%3+%5]
        lddqu           xmm2,       XMMWORD PTR [%3+%5+1]
        lddqu           xmm3,       XMMWORD PTR [%3+%5+2]

%if %1==0 || %1==1
        ; step both pointers two rows down for the next invocation
        lea             %2,         [%2+%4*2]
        lea             %3,         [%3+%5*2]
%endif

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endmacro

;-----------------------------------------------------------------------
; PROCESS_8X2X3 stage, src, ref, src_stride, ref_stride
;
; 8-byte-wide counterpart of PROCESS_16X2X3 using MMX registers.
; Accumulates the SAD of two 8-byte source rows against the reference
; candidates at ref+0, ref+1 and ref+2.
;
;   %1 stage : 0 = first call  (mm5/mm6/mm7 are initialised)
;              1 = middle call (accumulate, then advance pointers)
;              2 = last call   (accumulate, pointers left untouched)
;   %2, %3   : source / reference row pointer registers; advanced by
;              2 rows for stage 0 and 1.
;   %4, %5   : source / reference stride registers.
;
; Out:   mm5, mm6, mm7 = running SADs for ref+0, ref+1, ref+2
; Clobb: mm0-mm3
;-----------------------------------------------------------------------
%macro PROCESS_8X2X3 5
%if %1==0
        ; first row: load straight into the accumulators
        movq            mm0,       QWORD PTR [%2]
        movq            mm5,       QWORD PTR [%3]
        movq            mm6,       QWORD PTR [%3+1]
        movq            mm7,       QWORD PTR [%3+2]

        psadbw          mm5,       mm0
        psadbw          mm6,       mm0
        psadbw          mm7,       mm0
%else
        ; first row: compute in scratch registers and accumulate
        movq            mm0,       QWORD PTR [%2]
        movq            mm1,       QWORD PTR [%3]
        movq            mm2,       QWORD PTR [%3+1]
        movq            mm3,       QWORD PTR [%3+2]

        psadbw          mm1,       mm0
        psadbw          mm2,       mm0
        psadbw          mm3,       mm0

        paddw           mm5,       mm1
        paddw           mm6,       mm2
        paddw           mm7,       mm3
%endif
        ; second row
        movq            mm0,       QWORD PTR [%2+%4]
        movq            mm1,       QWORD PTR [%3+%5]
        movq            mm2,       QWORD PTR [%3+%5+1]
        movq            mm3,       QWORD PTR [%3+%5+2]

%if %1==0 || %1==1
        ; step both pointers two rows down for the next invocation
        lea             %2,        [%2+%4*2]
        lea             %3,        [%3+%5*2]
%endif

        psadbw          mm1,       mm0
        psadbw          mm2,       mm0
        psadbw          mm3,       mm0

        paddw           mm5,       mm1
        paddw           mm6,       mm2
        paddw           mm7,       mm3
%endmacro

;-----------------------------------------------------------------------
; void vp9_sad16x16x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Computes the 16x16 SAD of src against ref, ref+1 and ref+2 and stores
; the three sums in results[0], results[1], results[2].
; src rows are read with movdqa and must be 16-byte aligned.
; Clobb: rcx, xmm0-xmm3, xmm5-xmm7, plus the pointer registers.
;-----------------------------------------------------------------------
global sym(vp9_sad16x16x3_sse3) PRIVATE
sym(vp9_sad16x16x3_sse3):

    STACK_FRAME_CREATE_X3

        ; 8 invocations x 2 rows = 16 rows
        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        ; the stride/pointer held in rcx is dead from here on
        mov             rcx,        result_ptr

        ; fold the two 64-bit partial sums of each accumulator
        movq            xmm0,       xmm5
        psrldq          xmm5,       8

        paddw           xmm0,       xmm5
        movd            [rcx],      xmm0        ; results[0]
;-
        movq            xmm0,       xmm6
        psrldq          xmm6,       8

        paddw           xmm0,       xmm6
        movd            [rcx+4],    xmm0        ; results[1]
;-
        movq            xmm0,       xmm7
        psrldq          xmm7,       8

        paddw           xmm0,       xmm7
        movd            [rcx+8],    xmm0        ; results[2]

    STACK_FRAME_DESTROY_X3

;-----------------------------------------------------------------------
; void vp9_sad16x8x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Computes the 16x8 SAD of src against ref, ref+1 and ref+2 and stores
; the three sums in results[0], results[1], results[2].
; src rows are read with movdqa and must be 16-byte aligned.
; Clobb: rcx, xmm0-xmm3, xmm5-xmm7, plus the pointer registers.
;-----------------------------------------------------------------------
global sym(vp9_sad16x8x3_sse3) PRIVATE
sym(vp9_sad16x8x3_sse3):

    STACK_FRAME_CREATE_X3

        ; 4 invocations x 2 rows = 8 rows
        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        ; the stride/pointer held in rcx is dead from here on
        mov             rcx,        result_ptr

        ; fold the two 64-bit partial sums of each accumulator
        movq            xmm0,       xmm5
        psrldq          xmm5,       8

        paddw           xmm0,       xmm5
        movd            [rcx],      xmm0        ; results[0]
;-
        movq            xmm0,       xmm6
        psrldq          xmm6,       8

        paddw           xmm0,       xmm6
        movd            [rcx+4],    xmm0        ; results[1]
;-
        movq            xmm0,       xmm7
        psrldq          xmm7,       8

        paddw           xmm0,       xmm7
        movd            [rcx+8],    xmm0        ; results[2]

    STACK_FRAME_DESTROY_X3

;-----------------------------------------------------------------------
; void vp9_sad8x16x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Computes the 8x16 SAD of src against ref, ref+1 and ref+2 and stores
; the three sums in results[0], results[1], results[2].
; Clobb: rcx, mm0-mm3, mm5-mm7, plus the pointer registers.
; NOTE(review): MMX registers are used and no emms is issued here;
; presumably callers reset the x87/MMX state -- confirm.
;-----------------------------------------------------------------------
global sym(vp9_sad8x16x3_sse3) PRIVATE
sym(vp9_sad8x16x3_sse3):

    STACK_FRAME_CREATE_X3

        ; 8 invocations x 2 rows = 16 rows
        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        ; the stride/pointer held in rcx is dead from here on
        mov             rcx,        result_ptr

        ; pack the ref+0 and ref+1 sums into one qword
        punpckldq       mm5,        mm6

        movq            [rcx],      mm5         ; results[0], results[1]
        movd            [rcx+8],    mm7         ; results[2]

    STACK_FRAME_DESTROY_X3

;-----------------------------------------------------------------------
; void vp9_sad8x8x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Computes the 8x8 SAD of src against ref, ref+1 and ref+2 and stores
; the three sums in results[0], results[1], results[2].
; Clobb: rcx, mm0-mm3, mm5-mm7, plus the pointer registers.
; NOTE(review): MMX registers are used and no emms is issued here;
; presumably callers reset the x87/MMX state -- confirm.
;-----------------------------------------------------------------------
global sym(vp9_sad8x8x3_sse3) PRIVATE
sym(vp9_sad8x8x3_sse3):

    STACK_FRAME_CREATE_X3

        ; 4 invocations x 2 rows = 8 rows
        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        ; the stride/pointer held in rcx is dead from here on
        mov             rcx,        result_ptr

        ; pack the ref+0 and ref+1 sums into one qword
        punpckldq       mm5,        mm6

        movq            [rcx],      mm5         ; results[0], results[1]
        movd            [rcx+8],    mm7         ; results[2]

    STACK_FRAME_DESTROY_X3

;-----------------------------------------------------------------------
; void vp9_sad4x4x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Computes the 4x4 SAD of src against ref, ref+1 and ref+2 and stores
; the three sums in results[0], results[1], results[2].
; Two 4-byte rows are byte-interleaved into one 8-byte register so that
; a single psadbw covers a pair of rows.
; Clobb: rcx, mm0-mm7, plus the pointer registers.
; NOTE(review): MMX registers are used and no emms is issued here;
; presumably callers reset the x87/MMX state -- confirm.
;-----------------------------------------------------------------------
global sym(vp9_sad4x4x3_sse3) PRIVATE
sym(vp9_sad4x4x3_sse3):

    STACK_FRAME_CREATE_X3

        ; ---- rows 0 and 1 ----
        movd            mm0,        DWORD PTR [src_ptr]
        movd            mm1,        DWORD PTR [ref_ptr]

        movd            mm2,        DWORD PTR [src_ptr+src_stride]
        movd            mm3,        DWORD PTR [ref_ptr+ref_stride]

        punpcklbw       mm0,        mm2         ; src rows 0|1
        punpcklbw       mm1,        mm3         ; ref+0 rows 0|1

        movd            mm4,        DWORD PTR [ref_ptr+1]
        movd            mm5,        DWORD PTR [ref_ptr+2]

        movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]
        movd            mm3,        DWORD PTR [ref_ptr+ref_stride+2]

        psadbw          mm1,        mm0         ; mm1 = SAD(ref+0)

        punpcklbw       mm4,        mm2         ; ref+1 rows 0|1
        punpcklbw       mm5,        mm3         ; ref+2 rows 0|1

        psadbw          mm4,        mm0         ; mm4 = SAD(ref+1)
        psadbw          mm5,        mm0         ; mm5 = SAD(ref+2)

        lea             src_ptr,    [src_ptr+src_stride*2]
        lea             ref_ptr,    [ref_ptr+ref_stride*2]

        ; ---- rows 2 and 3 ----
        movd            mm0,        DWORD PTR [src_ptr]
        movd            mm2,        DWORD PTR [ref_ptr]

        movd            mm3,        DWORD PTR [src_ptr+src_stride]
        movd            mm6,        DWORD PTR [ref_ptr+ref_stride]

        punpcklbw       mm0,        mm3         ; src rows 2|3
        punpcklbw       mm2,        mm6         ; ref+0 rows 2|3

        movd            mm3,        DWORD PTR [ref_ptr+1]
        movd            mm7,        DWORD PTR [ref_ptr+2]

        psadbw          mm2,        mm0

        paddw           mm1,        mm2         ; mm1 = total SAD(ref+0)

        movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]
        movd            mm6,        DWORD PTR [ref_ptr+ref_stride+2]

        punpcklbw       mm3,        mm2         ; ref+1 rows 2|3
        punpcklbw       mm7,        mm6         ; ref+2 rows 2|3

        psadbw          mm3,        mm0
        psadbw          mm7,        mm0

        paddw           mm3,        mm4         ; mm3 = total SAD(ref+1)
        paddw           mm7,        mm5         ; mm7 = total SAD(ref+2)

        ; the stride/pointer held in rcx is dead from here on
        mov             rcx,        result_ptr

        ; pack the ref+0 and ref+1 sums into one qword
        punpckldq       mm1,        mm3

        movq            [rcx],      mm1         ; results[0], results[1]
        movd            [rcx+8],    mm7         ; results[2]

    STACK_FRAME_DESTROY_X3