;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; STACK_FRAME_CREATE_X3
;
; Function prologue shared by every sadNxMx3 routine in this file.
; Binds the symbolic names src_ptr, src_stride, ref_ptr, ref_stride and
; result_ptr to the location of each argument for the ABI being built:
;
;   32-bit : builds an rbp frame, saves rsi/rdi/rbx, then loads the
;            pointers and the sign-extended strides from arg(0)..arg(3).
;            result_ptr stays in memory as arg(4).
;   Win64  : arguments are used in rcx, rdx, r8, r9; result_ptr is read
;            from the stack.  SAVE_XMM comes from the included ABI file
;            (presumably it spills xmm6-xmm7 and defines
;            xmm_stack_space - confirm there).
;   SysV64 : arguments are used in rdi, rsi, rdx, rcx, r8.
;
; On both 64-bit ABIs the strides are C ints, so only the low 32 bits of
; their registers are meaningful.  They are sign-extended here because
; the kernels use them as full 64-bit index registers.
;
; end_ptr, ret_var and height are defined as well but are not referenced
; by the code visible in this file.
;-----------------------------------------------------------------------
%macro STACK_FRAME_CREATE_X3 0
%if ABI_IS_32BIT
  %define     src_ptr       rsi
  %define     src_stride    rax
  %define     ref_ptr       rdi
  %define     ref_stride    rdx
  %define     end_ptr       rcx
  %define     ret_var       rbx
  %define     result_ptr    arg(4)
  %define     height        dword ptr arg(4)
    push        rbp
    mov         rbp,        rsp
    push        rsi
    push        rdi
    push        rbx

    mov         rsi,        arg(0)              ; src_ptr
    mov         rdi,        arg(2)              ; ref_ptr

    movsxd      rax,        dword ptr arg(1)    ; src_stride
    movsxd      rdx,        dword ptr arg(3)    ; ref_stride
%else
  %if LIBVPX_YASM_WIN64
    SAVE_XMM 7, u
    %define     src_ptr     rcx
    %define     src_stride  rdx
    %define     ref_ptr     r8
    %define     ref_stride  r9
    %define     end_ptr     r10
    %define     ret_var     r11
    %define     result_ptr  [rsp+xmm_stack_space+8+4*8]
    %define     height      dword ptr [rsp+xmm_stack_space+8+4*8]

    ; int arguments: upper 32 bits of the register are not guaranteed
    movsxd      rdx,        edx                 ; src_stride
    movsxd      r9,         r9d                 ; ref_stride
  %else
    %define     src_ptr     rdi
    %define     src_stride  rsi
    %define     ref_ptr     rdx
    %define     ref_stride  rcx
    %define     end_ptr     r9
    %define     ret_var     r10
    %define     result_ptr  r8
    %define     height      r8

    ; int arguments: upper 32 bits of the register are not guaranteed
    movsxd      rsi,        esi                 ; src_stride
    movsxd      rcx,        ecx                 ; ref_stride
  %endif
%endif

%endmacro

;-----------------------------------------------------------------------
; STACK_FRAME_DESTROY_X3
;
; Function epilogue matching STACK_FRAME_CREATE_X3.  Resets the symbolic
; names to empty definitions, undoes whatever the prologue pushed or
; saved for the current ABI, and returns to the caller.
;-----------------------------------------------------------------------
%macro STACK_FRAME_DESTROY_X3 0
  %define     src_ptr
  %define     src_stride
  %define     ref_ptr
  %define     ref_stride
  %define     end_ptr
  %define     ret_var
  %define     result_ptr
  %define     height

%if ABI_IS_32BIT
    pop         rbx
    pop         rdi
    pop         rsi
    pop         rbp
%else
  %if LIBVPX_YASM_WIN64
    RESTORE_XMM
  %endif
%endif
    ret
%endmacro

;-----------------------------------------------------------------------
; PROCESS_16X2X3 stage, src, ref, src_stride, ref_stride
;
; Accumulates the SAD of two 16-byte source rows against the reference
; rows at byte offsets +0, +1 and +2.
;
;   arg 1 : stage.  0 = first pair (initialises the accumulators and
;           advances both pointers by two rows), 1 = middle pair
;           (accumulates and advances), 2 = last pair (accumulates,
;           pointers are left untouched).
;   arg 2 : source pointer register; read with movdqa, so each source
;           row must be 16-byte aligned.
;   arg 3 : reference pointer register; read with lddqu (unaligned).
;   arg 4 : source stride register.
;   arg 5 : reference stride register.
;
; Accumulators: xmm5 (offset +0), xmm6 (+1), xmm7 (+2).  Each holds two
; partial sums, one per 64-bit half, as produced by psadbw.
; Clobbers: xmm0-xmm3.
;-----------------------------------------------------------------------
%macro PROCESS_16X2X3 5
%if %1==0
        movdqa      xmm0,       XMMWORD PTR [%2]
        lddqu       xmm5,       XMMWORD PTR [%3]
        lddqu       xmm6,       XMMWORD PTR [%3+1]
        lddqu       xmm7,       XMMWORD PTR [%3+2]

        psadbw      xmm5,       xmm0
        psadbw      xmm6,       xmm0
        psadbw      xmm7,       xmm0
%else
        movdqa      xmm0,       XMMWORD PTR [%2]
        lddqu       xmm1,       XMMWORD PTR [%3]
        lddqu       xmm2,       XMMWORD PTR [%3+1]
        lddqu       xmm3,       XMMWORD PTR [%3+2]

        psadbw      xmm1,       xmm0
        psadbw      xmm2,       xmm0
        psadbw      xmm3,       xmm0

        paddw       xmm5,       xmm1
        paddw       xmm6,       xmm2
        paddw       xmm7,       xmm3
%endif
        movdqa      xmm0,       XMMWORD PTR [%2+%4]
        lddqu       xmm1,       XMMWORD PTR [%3+%5]
        lddqu       xmm2,       XMMWORD PTR [%3+%5+1]
        lddqu       xmm3,       XMMWORD PTR [%3+%5+2]

%if %1==0 || %1==1
        lea         %2,         [%2+%4*2]
        lea         %3,         [%3+%5*2]
%endif

        psadbw      xmm1,       xmm0
        psadbw      xmm2,       xmm0
        psadbw      xmm3,       xmm0

        paddw       xmm5,       xmm1
        paddw       xmm6,       xmm2
        paddw       xmm7,       xmm3
%endmacro

;-----------------------------------------------------------------------
; PROCESS_8X2X3 stage, src, ref, src_stride, ref_stride
;
; MMX counterpart of PROCESS_16X2X3 for 8-byte wide rows.  The arguments
; and the meaning of the stage value are the same.
;
; Accumulators: mm5 (offset +0), mm6 (+1), mm7 (+2).
; Clobbers: mm0-mm3.
;-----------------------------------------------------------------------
%macro PROCESS_8X2X3 5
%if %1==0
        movq        mm0,       QWORD PTR [%2]
        movq        mm5,       QWORD PTR [%3]
        movq        mm6,       QWORD PTR [%3+1]
        movq        mm7,       QWORD PTR [%3+2]

        psadbw      mm5,       mm0
        psadbw      mm6,       mm0
        psadbw      mm7,       mm0
%else
        movq        mm0,       QWORD PTR [%2]
        movq        mm1,       QWORD PTR [%3]
        movq        mm2,       QWORD PTR [%3+1]
        movq        mm3,       QWORD PTR [%3+2]

        psadbw      mm1,       mm0
        psadbw      mm2,       mm0
        psadbw      mm3,       mm0

        paddw       mm5,       mm1
        paddw       mm6,       mm2
        paddw       mm7,       mm3
%endif
        movq        mm0,       QWORD PTR [%2+%4]
        movq        mm1,       QWORD PTR [%3+%5]
        movq        mm2,       QWORD PTR [%3+%5+1]
        movq        mm3,       QWORD PTR [%3+%5+2]

%if %1==0 || %1==1
        lea         %2,        [%2+%4*2]
        lea         %3,        [%3+%5*2]
%endif

        psadbw      mm1,       mm0
        psadbw      mm2,       mm0
        psadbw      mm3,       mm0

        paddw       mm5,       mm1
        paddw       mm6,       mm2
        paddw       mm7,       mm3
%endmacro

;-----------------------------------------------------------------------
;void vpx_sad16x16x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Computes three 16x16 sums of absolute differences in one pass:
;   results[0] = SAD(src, ref), results[1] = SAD(src, ref+1),
;   results[2] = SAD(src, ref+2).
; Source rows are loaded with movdqa and must be 16-byte aligned.
; Clobbers: rcx, xmm0-xmm3, xmm5-xmm7, flags untouched by the kernel.
;-----------------------------------------------------------------------
global sym(vpx_sad16x16x3_sse3) PRIVATE
sym(vpx_sad16x16x3_sse3):

    STACK_FRAME_CREATE_X3

        ; 8 row pairs = 16 rows
        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        ; the stride/pointer registers are dead from here on, so rcx
        ; can be reused as the output pointer on every ABI
        mov         rcx,        result_ptr

        ; fold the two 64-bit partial sums of each accumulator into one
        ; value and store its low dword; 16 rows * 16 bytes * 255 still
        ; fits in the 16-bit lane added by paddw
        movq        xmm0,       xmm5
        psrldq      xmm5,       8

        paddw       xmm0,       xmm5
        movd        [rcx],      xmm0            ; results[0]
;-
        movq        xmm0,       xmm6
        psrldq      xmm6,       8

        paddw       xmm0,       xmm6
        movd        [rcx+4],    xmm0            ; results[1]
;-
        movq        xmm0,       xmm7
        psrldq      xmm7,       8

        paddw       xmm0,       xmm7
        movd        [rcx+8],    xmm0            ; results[2]

    STACK_FRAME_DESTROY_X3

;-----------------------------------------------------------------------
;void vpx_sad16x8x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Same as vpx_sad16x16x3_sse3 but for a block 16 bytes wide and 8 rows
; high: results[k] = SAD(src, ref+k) for k = 0, 1, 2.
;-----------------------------------------------------------------------
global sym(vpx_sad16x8x3_sse3) PRIVATE
sym(vpx_sad16x8x3_sse3):

    STACK_FRAME_CREATE_X3

        ; 4 row pairs = 8 rows
        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        mov         rcx,        result_ptr

        ; fold low and high partial sums, store the low dword
        movq        xmm0,       xmm5
        psrldq      xmm5,       8

        paddw       xmm0,       xmm5
        movd        [rcx],      xmm0            ; results[0]
;-
        movq        xmm0,       xmm6
        psrldq      xmm6,       8

        paddw       xmm0,       xmm6
        movd        [rcx+4],    xmm0            ; results[1]
;-
        movq        xmm0,       xmm7
        psrldq      xmm7,       8

        paddw       xmm0,       xmm7
        movd        [rcx+8],    xmm0            ; results[2]

    STACK_FRAME_DESTROY_X3

;-----------------------------------------------------------------------
;void vpx_sad8x16x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Block 8 bytes wide and 16 rows high, computed in MMX registers:
; results[k] = SAD(src, ref+k) for k = 0, 1, 2.
; NOTE(review): mm0-mm7 are written and no emms is executed here;
; presumably the caller is expected to clear the MMX state - confirm.
;-----------------------------------------------------------------------
global sym(vpx_sad8x16x3_sse3) PRIVATE
sym(vpx_sad8x16x3_sse3):

    STACK_FRAME_CREATE_X3

        ; 8 row pairs = 16 rows
        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        mov         rcx,        result_ptr

        ; pack the low dwords of mm5 and mm6 so results[0] and
        ; results[1] go out in a single 8-byte store
        punpckldq   mm5,        mm6

        movq        [rcx],      mm5             ; results[0], results[1]
        movd        [rcx+8],    mm7             ; results[2]

    STACK_FRAME_DESTROY_X3

;-----------------------------------------------------------------------
;void vpx_sad8x8x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Block 8 bytes wide and 8 rows high, computed in MMX registers:
; results[k] = SAD(src, ref+k) for k = 0, 1, 2.
; NOTE(review): mm0-mm7 are written and no emms is executed here;
; presumably the caller is expected to clear the MMX state - confirm.
;-----------------------------------------------------------------------
global sym(vpx_sad8x8x3_sse3) PRIVATE
sym(vpx_sad8x8x3_sse3):

    STACK_FRAME_CREATE_X3

        ; 4 row pairs = 8 rows
        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        mov         rcx,        result_ptr

        ; pack results[0] and results[1] into one 8-byte store
        punpckldq   mm5,        mm6

        movq        [rcx],      mm5             ; results[0], results[1]
        movd        [rcx+8],    mm7             ; results[2]

    STACK_FRAME_DESTROY_X3

;-----------------------------------------------------------------------
;void vpx_sad4x4x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Block 4 bytes wide and 4 rows high, computed in MMX registers:
; results[k] = SAD(src, ref+k) for k = 0, 1, 2.
; Two 4-byte rows are byte-interleaved into one 8-byte register (the
; same way for source and reference) so a single psadbw covers two rows.
; NOTE(review): mm0-mm7 are written and no emms is executed here;
; presumably the caller is expected to clear the MMX state - confirm.
;-----------------------------------------------------------------------
global sym(vpx_sad4x4x3_sse3) PRIVATE
sym(vpx_sad4x4x3_sse3):

    STACK_FRAME_CREATE_X3

        ; ---- rows 0 and 1 ----
        movd        mm0,        DWORD PTR [src_ptr]
        movd        mm1,        DWORD PTR [ref_ptr]

        movd        mm2,        DWORD PTR [src_ptr+src_stride]
        movd        mm3,        DWORD PTR [ref_ptr+ref_stride]

        punpcklbw   mm0,        mm2             ; src rows 0,1 interleaved
        punpcklbw   mm1,        mm3             ; ref+0 rows 0,1 interleaved

        movd        mm4,        DWORD PTR [ref_ptr+1]
        movd        mm5,        DWORD PTR [ref_ptr+2]

        movd        mm2,        DWORD PTR [ref_ptr+ref_stride+1]
        movd        mm3,        DWORD PTR [ref_ptr+ref_stride+2]

        psadbw      mm1,        mm0             ; partial SAD, offset +0

        punpcklbw   mm4,        mm2
        punpcklbw   mm5,        mm3

        psadbw      mm4,        mm0             ; partial SAD, offset +1
        psadbw      mm5,        mm0             ; partial SAD, offset +2

        ; ---- rows 2 and 3 ----
        lea         src_ptr,    [src_ptr+src_stride*2]
        lea         ref_ptr,    [ref_ptr+ref_stride*2]

        movd        mm0,        DWORD PTR [src_ptr]
        movd        mm2,        DWORD PTR [ref_ptr]

        movd        mm3,        DWORD PTR [src_ptr+src_stride]
        movd        mm6,        DWORD PTR [ref_ptr+ref_stride]

        punpcklbw   mm0,        mm3             ; src rows 2,3 interleaved
        punpcklbw   mm2,        mm6             ; ref+0 rows 2,3 interleaved

        movd        mm3,        DWORD PTR [ref_ptr+1]
        movd        mm7,        DWORD PTR [ref_ptr+2]

        psadbw      mm2,        mm0

        paddw       mm1,        mm2             ; total SAD, offset +0

        movd        mm2,        DWORD PTR [ref_ptr+ref_stride+1]
        movd        mm6,        DWORD PTR [ref_ptr+ref_stride+2]

        punpcklbw   mm3,        mm2
        punpcklbw   mm7,        mm6

        psadbw      mm3,        mm0
        psadbw      mm7,        mm0

        paddw       mm3,        mm4             ; total SAD, offset +1
        paddw       mm7,        mm5             ; total SAD, offset +2

        mov         rcx,        result_ptr

        ; pack results[0] and results[1] into one 8-byte store
        punpckldq   mm1,        mm3

        movq        [rcx],      mm1             ; results[0], results[1]
        movd        [rcx+8],    mm7             ; results[2]

    STACK_FRAME_DESTROY_X3