;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; STACK_FRAME_CREATE_X3
;
; Binds the symbolic names src_ptr / src_stride / ref_ptr / ref_stride /
; result_ptr (plus end_ptr, ret_var and height, which the routines in
; this file do not reference) to registers or stack slots for the ABI
; being assembled, and emits whatever prologue that ABI needs.
;
;   32-bit : builds an rbp frame, saves rsi/rdi/rbx, loads the pointer
;            and stride arguments from the stack via arg(n).
;   Win64  : SAVE_XMM 7, u ; arguments stay in rcx/rdx/r8/r9, the fifth
;            argument is addressed on the stack.
;   other  : arguments stay in rdi/rsi/rdx/rcx/r8.
;
; The two strides are C 'int' values that are used below as 64-bit
; address offsets. The 32-bit path already sign-extends them with
; movsxd; the 64-bit paths do the same so that the upper halves of the
; argument registers never leak into an effective address.
;-----------------------------------------------------------------------
%macro STACK_FRAME_CREATE_X3 0
%if ABI_IS_32BIT
  %define     src_ptr       rsi
  %define     src_stride    rax
  %define     ref_ptr       rdi
  %define     ref_stride    rdx
  %define     end_ptr       rcx
  %define     ret_var       rbx
  %define     result_ptr    arg(4)
  %define     height        dword ptr arg(4)
  push        rbp
  mov         rbp,        rsp
  push        rsi
  push        rdi
  push        rbx

  mov         rsi,        arg(0)              ; src_ptr
  mov         rdi,        arg(2)              ; ref_ptr

  movsxd      rax,        dword ptr arg(1)    ; src_stride
  movsxd      rdx,        dword ptr arg(3)    ; ref_stride
%else
  %if LIBVPX_YASM_WIN64
    SAVE_XMM 7, u
    %define     src_ptr     rcx
    %define     src_stride  rdx
    %define     ref_ptr     r8
    %define     ref_stride  r9
    %define     end_ptr     r10
    %define     ret_var     r11
    %define     result_ptr  [rsp+xmm_stack_space+8+4*8]
    %define     height      dword ptr [rsp+xmm_stack_space+8+4*8]

    movsxd      rdx,        edx               ; src_stride: int -> 64-bit
    movsxd      r9,         r9d               ; ref_stride: int -> 64-bit
  %else
    %define     src_ptr     rdi
    %define     src_stride  rsi
    %define     ref_ptr     rdx
    %define     ref_stride  rcx
    %define     end_ptr     r9
    %define     ret_var     r10
    %define     result_ptr  r8
    %define     height      r8

    movsxd      rsi,        esi               ; src_stride: int -> 64-bit
    movsxd      rcx,        ecx               ; ref_stride: int -> 64-bit
  %endif
%endif

%endmacro

;-----------------------------------------------------------------------
; STACK_FRAME_DESTROY_X3
;
; Counterpart of STACK_FRAME_CREATE_X3: redefines the symbolic names to
; empty, undoes the ABI-specific prologue (register pops on 32-bit,
; RESTORE_XMM on Win64) and returns to the caller.
;-----------------------------------------------------------------------
%macro STACK_FRAME_DESTROY_X3 0
  %define     src_ptr
  %define     src_stride
  %define     ref_ptr
  %define     ref_stride
  %define     end_ptr
  %define     ret_var
  %define     result_ptr
  %define     height

%if ABI_IS_32BIT
  pop         rbx
  pop         rdi
  pop         rsi
  pop         rbp
%else
  %if LIBVPX_YASM_WIN64
    RESTORE_XMM
  %endif
%endif
  ret
%endmacro

;-----------------------------------------------------------------------
; PROCESS_16X2X3 mode, src, ref, src_stride, ref_stride
;
; Handles two 16-byte rows. Each source row is compared with psadbw
; against the reference row at byte offsets +0, +1 and +2; the three
; running totals live in xmm5, xmm6 and xmm7 respectively.
;
;   mode 0 : first call  - the first row initialises xmm5..xmm7,
;                          then src/ref advance by two rows
;   mode 1 : middle call - accumulate, then src/ref advance by two rows
;   mode 2 : last call   - accumulate, pointers are left untouched
;
; The source rows are read with movdqa, the reference rows with lddqu.
; Uses xmm0..xmm3 as scratch.
;-----------------------------------------------------------------------
%macro PROCESS_16X2X3 5
%if %1==0
        movdqa          xmm0,       XMMWORD PTR [%2]
        lddqu           xmm5,       XMMWORD PTR [%3]
        lddqu           xmm6,       XMMWORD PTR [%3+1]
        lddqu           xmm7,       XMMWORD PTR [%3+2]

        psadbw          xmm5,       xmm0
        psadbw          xmm6,       xmm0
        psadbw          xmm7,       xmm0
%else
        movdqa          xmm0,       XMMWORD PTR [%2]
        lddqu           xmm1,       XMMWORD PTR [%3]
        lddqu           xmm2,       XMMWORD PTR [%3+1]
        lddqu           xmm3,       XMMWORD PTR [%3+2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endif
        ; second row of the pair
        movdqa          xmm0,       XMMWORD PTR [%2+%4]
        lddqu           xmm1,       XMMWORD PTR [%3+%5]
        lddqu           xmm2,       XMMWORD PTR [%3+%5+1]
        lddqu           xmm3,       XMMWORD PTR [%3+%5+2]

%if %1==0 || %1==1
        ; step to the next row pair; the loads above are already issued
        lea             %2,         [%2+%4*2]
        lea             %3,         [%3+%5*2]
%endif

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endmacro

;-----------------------------------------------------------------------
; PROCESS_8X2X3 mode, src, ref, src_stride, ref_stride
;
; 8-byte-wide variant of PROCESS_16X2X3 using the MMX registers:
; the running totals for reference offsets +0, +1 and +2 live in
; mm5, mm6 and mm7; mm0..mm3 are scratch. The mode argument has the
; same meaning (0 = initialise and advance, 1 = accumulate and advance,
; 2 = accumulate without advancing).
;-----------------------------------------------------------------------
%macro PROCESS_8X2X3 5
%if %1==0
        movq            mm0,       QWORD PTR [%2]
        movq            mm5,       QWORD PTR [%3]
        movq            mm6,       QWORD PTR [%3+1]
        movq            mm7,       QWORD PTR [%3+2]

        psadbw          mm5,       mm0
        psadbw          mm6,       mm0
        psadbw          mm7,       mm0
%else
        movq            mm0,       QWORD PTR [%2]
        movq            mm1,       QWORD PTR [%3]
        movq            mm2,       QWORD PTR [%3+1]
        movq            mm3,       QWORD PTR [%3+2]

        psadbw          mm1,       mm0
        psadbw          mm2,       mm0
        psadbw          mm3,       mm0

        paddw           mm5,       mm1
        paddw           mm6,       mm2
        paddw           mm7,       mm3
%endif
        ; second row of the pair
        movq            mm0,       QWORD PTR [%2+%4]
        movq            mm1,       QWORD PTR [%3+%5]
        movq            mm2,       QWORD PTR [%3+%5+1]
        movq            mm3,       QWORD PTR [%3+%5+2]

%if %1==0 || %1==1
        ; step to the next row pair; the loads above are already issued
        lea             %2,        [%2+%4*2]
        lea             %3,        [%3+%5*2]
%endif

        psadbw          mm1,       mm0
        psadbw          mm2,       mm0
        psadbw          mm3,       mm0

        paddw           mm5,       mm1
        paddw           mm6,       mm2
        paddw           mm7,       mm3
%endmacro

SECTION .text

;-----------------------------------------------------------------------
;void vpx_sad16x16x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Sums absolute differences over a 16-byte x 16-row block for ref_ptr,
; ref_ptr+1 and ref_ptr+2 and stores the three totals as consecutive
; dwords at results[0..2]. Clobbers rcx and xmm0..xmm7 (xmm6/xmm7 are
; saved/restored by the frame macros on Win64).
;-----------------------------------------------------------------------
global sym(vpx_sad16x16x3_sse3) PRIVATE
sym(vpx_sad16x16x3_sse3):

    STACK_FRAME_CREATE_X3

        ; 8 calls x 2 rows = 16 rows
        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        mov             rcx,        result_ptr

        ; results[0]: fold the high-qword partial sum into the low one
        movq            xmm0,       xmm5
        psrldq          xmm5,       8

        paddw           xmm0,       xmm5
        movd            [rcx],      xmm0

        ; results[1]
        movq            xmm0,       xmm6
        psrldq          xmm6,       8

        paddw           xmm0,       xmm6
        movd            [rcx+4],    xmm0

        ; results[2]
        movq            xmm0,       xmm7
        psrldq          xmm7,       8

        paddw           xmm0,       xmm7
        movd            [rcx+8],    xmm0

    STACK_FRAME_DESTROY_X3

;-----------------------------------------------------------------------
;void vpx_sad16x8x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Same as vpx_sad16x16x3_sse3 but over 8 rows (4 calls x 2 rows).
;-----------------------------------------------------------------------
global sym(vpx_sad16x8x3_sse3) PRIVATE
sym(vpx_sad16x8x3_sse3):

    STACK_FRAME_CREATE_X3

        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        mov             rcx,        result_ptr

        ; results[0]: fold the high-qword partial sum into the low one
        movq            xmm0,       xmm5
        psrldq          xmm5,       8

        paddw           xmm0,       xmm5
        movd            [rcx],      xmm0

        ; results[1]
        movq            xmm0,       xmm6
        psrldq          xmm6,       8

        paddw           xmm0,       xmm6
        movd            [rcx+4],    xmm0

        ; results[2]
        movq            xmm0,       xmm7
        psrldq          xmm7,       8

        paddw           xmm0,       xmm7
        movd            [rcx+8],    xmm0

    STACK_FRAME_DESTROY_X3

;-----------------------------------------------------------------------
;void vpx_sad8x16x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Sums absolute differences over an 8-byte x 16-row block for ref_ptr,
; ref_ptr+1 and ref_ptr+2 and stores the three totals at results[0..2].
; Uses MMX registers mm0..mm7 and clobbers rcx.
; NOTE(review): no emms is executed in this routine - presumably the
; caller is responsible for clearing MMX state; confirm.
;-----------------------------------------------------------------------
global sym(vpx_sad8x16x3_sse3) PRIVATE
sym(vpx_sad8x16x3_sse3):

    STACK_FRAME_CREATE_X3

        ; 8 calls x 2 rows = 16 rows
        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        mov             rcx,        result_ptr

        ; pack the +0 and +1 totals into one qword: results[0], results[1]
        punpckldq       mm5,        mm6

        movq            [rcx],      mm5
        movd            [rcx+8],    mm7         ; results[2]

    STACK_FRAME_DESTROY_X3

;-----------------------------------------------------------------------
;void vpx_sad8x8x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Same as vpx_sad8x16x3_sse3 but over 8 rows (4 calls x 2 rows).
; NOTE(review): no emms is executed in this routine - presumably the
; caller is responsible for clearing MMX state; confirm.
;-----------------------------------------------------------------------
global sym(vpx_sad8x8x3_sse3) PRIVATE
sym(vpx_sad8x8x3_sse3):

    STACK_FRAME_CREATE_X3

        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        mov             rcx,        result_ptr

        ; pack the +0 and +1 totals into one qword: results[0], results[1]
        punpckldq       mm5,        mm6

        movq            [rcx],      mm5
        movd            [rcx+8],    mm7         ; results[2]

    STACK_FRAME_DESTROY_X3

;-----------------------------------------------------------------------
;void vpx_sad4x4x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Sums absolute differences over a 4-byte x 4-row block for ref_ptr,
; ref_ptr+1 and ref_ptr+2 and stores the three totals at results[0..2].
; Two 4-byte rows are interleaved with punpcklbw into one 8-byte MMX
; register (source and reference are interleaved the same way, so bytes
; still line up) and each row pair is then handled by a single psadbw.
; Uses mm0..mm7 and clobbers rcx.
; NOTE(review): no emms is executed in this routine - presumably the
; caller is responsible for clearing MMX state; confirm.
;-----------------------------------------------------------------------
global sym(vpx_sad4x4x3_sse3) PRIVATE
sym(vpx_sad4x4x3_sse3):

    STACK_FRAME_CREATE_X3

        ; ---- rows 0 and 1 ----
        movd            mm0,        DWORD PTR [src_ptr]
        movd            mm1,        DWORD PTR [ref_ptr]

        movd            mm2,        DWORD PTR [src_ptr+src_stride]
        movd            mm3,        DWORD PTR [ref_ptr+ref_stride]

        punpcklbw       mm0,        mm2         ; mm0 = src rows 0/1 interleaved
        punpcklbw       mm1,        mm3         ; mm1 = ref rows 0/1, offset +0

        movd            mm4,        DWORD PTR [ref_ptr+1]
        movd            mm5,        DWORD PTR [ref_ptr+2]

        movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]
        movd            mm3,        DWORD PTR [ref_ptr+ref_stride+2]

        psadbw          mm1,        mm0         ; mm1 = SAD, offset +0

        punpcklbw       mm4,        mm2         ; ref rows 0/1, offset +1
        punpcklbw       mm5,        mm3         ; ref rows 0/1, offset +2

        psadbw          mm4,        mm0         ; mm4 = SAD, offset +1
        psadbw          mm5,        mm0         ; mm5 = SAD, offset +2

        lea             src_ptr,    [src_ptr+src_stride*2]
        lea             ref_ptr,    [ref_ptr+ref_stride*2]

        ; ---- rows 2 and 3 ----
        movd            mm0,        DWORD PTR [src_ptr]
        movd            mm2,        DWORD PTR [ref_ptr]

        movd            mm3,        DWORD PTR [src_ptr+src_stride]
        movd            mm6,        DWORD PTR [ref_ptr+ref_stride]

        punpcklbw       mm0,        mm3         ; mm0 = src rows 2/3 interleaved
        punpcklbw       mm2,        mm6         ; mm2 = ref rows 2/3, offset +0

        movd            mm3,        DWORD PTR [ref_ptr+1]
        movd            mm7,        DWORD PTR [ref_ptr+2]

        psadbw          mm2,        mm0

        paddw           mm1,        mm2         ; mm1 = total, offset +0

        movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]
        movd            mm6,        DWORD PTR [ref_ptr+ref_stride+2]

        punpcklbw       mm3,        mm2         ; ref rows 2/3, offset +1
        punpcklbw       mm7,        mm6         ; ref rows 2/3, offset +2

        psadbw          mm3,        mm0
        psadbw          mm7,        mm0

        paddw           mm3,        mm4         ; mm3 = total, offset +1
        paddw           mm7,        mm5         ; mm7 = total, offset +2

        mov             rcx,        result_ptr

        ; pack the +0 and +1 totals into one qword: results[0], results[1]
        punpckldq       mm1,        mm3

        movq            [rcx],      mm1
        movd            [rcx+8],    mm7         ; results[2]

    STACK_FRAME_DESTROY_X3