;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

;-----------------------------------------------------------------------
; SSE4.1 "x8" sum-of-absolute-differences kernels.
;
; Syntax: NASM/Yasm, Intel operand order.
;
; Every entry point compares one source block against the reference
; block at eight consecutive byte offsets (ref_ptr + 0 .. ref_ptr + 7)
; and stores the eight sums as 32-bit values to sad_array.
;
; Register roles shared by the PROCESS_* macros and the entry points:
;   rsi  = current source row          rax = source stride (bytes)
;   rdi  = current reference row       rdx = reference stride (bytes)
;   xmm1 = eight 16-bit running sums, one per reference offset
;   xmm0, xmm2-xmm5 = scratch
;
; The running sums are 16-bit and added with paddw (wrapping). The
; largest block handled here is 16x16, whose maximum sum is
; 16 * 16 * 255 = 65280, so no lane can overflow.
;
; sym(), arg(), SHADOW_ARGS_TO_STACK, UNSHADOW_ARGS, PRIVATE and the
; XMMWORD / MMWORD / PTR keywords are not defined in this file; they are
; presumably supplied by the include below -- confirm against that file.
;-----------------------------------------------------------------------

%include "vpx_ports/x86_abi_support.asm"

; Select the code section explicitly instead of relying on whichever
; section is active once the include has been processed.
SECTION .text

; mpsadbw immediates. Bits 1:0 pick the 4-byte group of the source
; operand, bit 2 picks the starting byte (0 or 4) of the eleven-byte
; window taken from the destination operand.
%define MPSADBW_SRC0_DST0   0x0     ; source bytes 0-3, windows start at byte 0
%define MPSADBW_SRC1_DST4   0x5     ; source bytes 4-7, windows start at byte 4

;-----------------------------------------------------------------------
; PROCESS_16X2X8 first
;
; Accumulates two rows of a 16-pixel-wide block into xmm1.
;   first != 0 : row 0 initialises xmm1 (call once, at the top of a block)
;   first == 0 : row 0 is added to the sums already held in xmm1
; Row 1 is always added.
;
; In:    rsi, rdi, rax, rdx as described in the file header
; Out:   xmm1 updated; rsi and rdi advanced by two rows
; Clobb: xmm0, xmm2, xmm3, xmm4, xmm5
; Reads: 16 source bytes per row with movdqa, so every source row must
;        be 16-byte aligned; 24 reference bytes per row with movq
;        (no alignment requirement).
;-----------------------------------------------------------------------
%macro PROCESS_16X2X8 1
%if %1
        ; Row 0, first call: results go straight into the accumulator.
        movdqa          xmm0,       XMMWORD PTR [rsi]
        movq            xmm1,       MMWORD PTR [rdi]
        movq            xmm3,       MMWORD PTR [rdi + 8]
        movq            xmm2,       MMWORD PTR [rdi + 16]
        punpcklqdq      xmm1,       xmm3                ; xmm1 = ref bytes 0-15
        punpcklqdq      xmm3,       xmm2                ; xmm3 = ref bytes 8-23

        movdqa          xmm2,       xmm1
        mpsadbw         xmm1,       xmm0,   MPSADBW_SRC0_DST0
        mpsadbw         xmm2,       xmm0,   MPSADBW_SRC1_DST4

        psrldq          xmm0,       8                   ; source bytes 8-15 -> low half

        movdqa          xmm4,       xmm3
        mpsadbw         xmm3,       xmm0,   MPSADBW_SRC0_DST0
        mpsadbw         xmm4,       xmm0,   MPSADBW_SRC1_DST4

        paddw           xmm1,       xmm2
        paddw           xmm1,       xmm3
        paddw           xmm1,       xmm4
%else
        ; Row 0, later calls: build the row sum in xmm5, then accumulate.
        movdqa          xmm0,       XMMWORD PTR [rsi]
        movq            xmm5,       MMWORD PTR [rdi]
        movq            xmm3,       MMWORD PTR [rdi + 8]
        movq            xmm2,       MMWORD PTR [rdi + 16]
        punpcklqdq      xmm5,       xmm3                ; xmm5 = ref bytes 0-15
        punpcklqdq      xmm3,       xmm2                ; xmm3 = ref bytes 8-23

        movdqa          xmm2,       xmm5
        mpsadbw         xmm5,       xmm0,   MPSADBW_SRC0_DST0
        mpsadbw         xmm2,       xmm0,   MPSADBW_SRC1_DST4

        psrldq          xmm0,       8                   ; source bytes 8-15 -> low half

        movdqa          xmm4,       xmm3
        mpsadbw         xmm3,       xmm0,   MPSADBW_SRC0_DST0
        mpsadbw         xmm4,       xmm0,   MPSADBW_SRC1_DST4

        paddw           xmm5,       xmm2
        paddw           xmm5,       xmm3
        paddw           xmm5,       xmm4

        paddw           xmm1,       xmm5
%endif
        ; Row 1: always accumulated.
        movdqa          xmm0,       XMMWORD PTR [rsi + rax]
        movq            xmm5,       MMWORD PTR [rdi + rdx]
        movq            xmm3,       MMWORD PTR [rdi + rdx + 8]
        movq            xmm2,       MMWORD PTR [rdi + rdx + 16]
        punpcklqdq      xmm5,       xmm3
        punpcklqdq      xmm3,       xmm2

        ; Both rows are loaded; step the pointers to the next row pair.
        lea             rsi,        [rsi + rax*2]
        lea             rdi,        [rdi + rdx*2]

        movdqa          xmm2,       xmm5
        mpsadbw         xmm5,       xmm0,   MPSADBW_SRC0_DST0
        mpsadbw         xmm2,       xmm0,   MPSADBW_SRC1_DST4

        psrldq          xmm0,       8
        movdqa          xmm4,       xmm3
        mpsadbw         xmm3,       xmm0,   MPSADBW_SRC0_DST0
        mpsadbw         xmm4,       xmm0,   MPSADBW_SRC1_DST4

        paddw           xmm5,       xmm2
        paddw           xmm5,       xmm3
        paddw           xmm5,       xmm4

        paddw           xmm1,       xmm5
%endmacro

;-----------------------------------------------------------------------
; PROCESS_8X2X8 first
;
; Accumulates two rows of an 8-pixel-wide block into xmm1. The argument
; has the same meaning as for PROCESS_16X2X8.
;
; In:    rsi, rdi, rax, rdx as described in the file header
; Out:   xmm1 updated; rsi and rdi advanced by two rows
; Clobb: xmm0, xmm2, xmm3, xmm5
; Reads: 8 source bytes and 16 reference bytes per row, all with movq
;        (no alignment requirement).
;-----------------------------------------------------------------------
%macro PROCESS_8X2X8 1
%if %1
        ; Row 0, first call: results go straight into the accumulator.
        movq            xmm0,       MMWORD PTR [rsi]
        movq            xmm1,       MMWORD PTR [rdi]
        movq            xmm3,       MMWORD PTR [rdi + 8]
        punpcklqdq      xmm1,       xmm3                ; xmm1 = ref bytes 0-15

        movdqa          xmm2,       xmm1
        mpsadbw         xmm1,       xmm0,   MPSADBW_SRC0_DST0
        mpsadbw         xmm2,       xmm0,   MPSADBW_SRC1_DST4
        paddw           xmm1,       xmm2
%else
        ; Row 0, later calls: build the row sum in xmm5, then accumulate.
        movq            xmm0,       MMWORD PTR [rsi]
        movq            xmm5,       MMWORD PTR [rdi]
        movq            xmm3,       MMWORD PTR [rdi + 8]
        punpcklqdq      xmm5,       xmm3                ; xmm5 = ref bytes 0-15

        movdqa          xmm2,       xmm5
        mpsadbw         xmm5,       xmm0,   MPSADBW_SRC0_DST0
        mpsadbw         xmm2,       xmm0,   MPSADBW_SRC1_DST4
        paddw           xmm5,       xmm2

        paddw           xmm1,       xmm5
%endif
        ; Row 1: always accumulated.
        movq            xmm0,       MMWORD PTR [rsi + rax]
        movq            xmm5,       MMWORD PTR [rdi + rdx]
        movq            xmm3,       MMWORD PTR [rdi + rdx + 8]
        punpcklqdq      xmm5,       xmm3

        ; Both rows are loaded; step the pointers to the next row pair.
        lea             rsi,        [rsi + rax*2]
        lea             rdi,        [rdi + rdx*2]

        movdqa          xmm2,       xmm5
        mpsadbw         xmm5,       xmm0,   MPSADBW_SRC0_DST0
        mpsadbw         xmm2,       xmm0,   MPSADBW_SRC1_DST4
        paddw           xmm5,       xmm2

        paddw           xmm1,       xmm5
%endmacro

;-----------------------------------------------------------------------
; PROCESS_4X2X8 first
;
; Accumulates two rows of a 4-pixel-wide block into xmm1. The argument
; has the same meaning as for PROCESS_16X2X8. A single mpsadbw covers the
; whole 4-byte row, so no second selector is needed.
;
; In:    rsi, rdi, rax, rdx as described in the file header
; Out:   xmm1 updated; rsi and rdi advanced by two rows
; Clobb: xmm0, xmm3, xmm5
; Reads: 4 source bytes (movd) and 16 reference bytes (movq) per row
;        (no alignment requirement).
;-----------------------------------------------------------------------
%macro PROCESS_4X2X8 1
%if %1
        ; Row 0, first call: results go straight into the accumulator.
        movd            xmm0,       [rsi]
        movq            xmm1,       MMWORD PTR [rdi]
        movq            xmm3,       MMWORD PTR [rdi + 8]
        punpcklqdq      xmm1,       xmm3                ; xmm1 = ref bytes 0-15

        mpsadbw         xmm1,       xmm0,   MPSADBW_SRC0_DST0
%else
        ; Row 0, later calls: build the row sum in xmm5, then accumulate.
        movd            xmm0,       [rsi]
        movq            xmm5,       MMWORD PTR [rdi]
        movq            xmm3,       MMWORD PTR [rdi + 8]
        punpcklqdq      xmm5,       xmm3                ; xmm5 = ref bytes 0-15

        mpsadbw         xmm5,       xmm0,   MPSADBW_SRC0_DST0

        paddw           xmm1,       xmm5
%endif
        ; Row 1: always accumulated.
        movd            xmm0,       [rsi + rax]
        movq            xmm5,       MMWORD PTR [rdi + rdx]
        movq            xmm3,       MMWORD PTR [rdi + rdx + 8]
        punpcklqdq      xmm5,       xmm3

        ; Both rows are loaded; step the pointers to the next row pair.
        lea             rsi,        [rsi + rax*2]
        lea             rdi,        [rdi + rdx*2]

        mpsadbw         xmm5,       xmm0,   MPSADBW_SRC0_DST0

        paddw           xmm1,       xmm5
%endmacro

;-----------------------------------------------------------------------
; WRITE_AS_INTS
;
; Zero-extends the eight 16-bit sums in xmm1 to 32 bits and stores them,
; in offset order, to the buffer passed as argument 4 (32 bytes total).
;
; Clobb: rdi, xmm0, xmm1, xmm2
; The stores use movdqa, so the destination must be 16-byte aligned.
;-----------------------------------------------------------------------
%macro WRITE_AS_INTS 0
        mov             rdi,        arg(4)              ; sad_array
        pxor            xmm0,       xmm0
        movdqa          xmm2,       xmm1
        punpcklwd       xmm1,       xmm0                ; sums 0-3 as dwords
        punpckhwd       xmm2,       xmm0                ; sums 4-7 as dwords

        movdqa          [rdi],      xmm1
        movdqa          [rdi + 16], xmm2
%endmacro

;-----------------------------------------------------------------------
;void vpx_sad16x16x8_sse4_1(
;    const unsigned char *src_ptr,   16-byte aligned (movdqa loads)
;    int  src_stride,                must keep every row 16-byte aligned
;    const unsigned char *ref_ptr,   rows are read 24 bytes wide
;    int  ref_stride,
;    unsigned int *sad_array);       eight 32-bit sums, 16-byte aligned
;-----------------------------------------------------------------------
global sym(vpx_sad16x16x8_sse4_1) PRIVATE
sym(vpx_sad16x16x8_sse4_1):
        push            rbp
        mov             rbp,        rsp
        SHADOW_ARGS_TO_STACK 5
        push            rsi
        push            rdi
        ; end prolog

        mov             rsi,        arg(0)              ; src_ptr
        mov             rdi,        arg(2)              ; ref_ptr

        movsxd          rax,        dword ptr arg(1)    ; src_stride
        movsxd          rdx,        dword ptr arg(3)    ; ref_stride

        ; 8 row pairs = 16 rows
        PROCESS_16X2X8 1
        PROCESS_16X2X8 0
        PROCESS_16X2X8 0
        PROCESS_16X2X8 0
        PROCESS_16X2X8 0
        PROCESS_16X2X8 0
        PROCESS_16X2X8 0
        PROCESS_16X2X8 0

        WRITE_AS_INTS

        ; begin epilog
        pop             rdi
        pop             rsi
        UNSHADOW_ARGS
        pop             rbp
        ret


;-----------------------------------------------------------------------
;void vpx_sad16x8x8_sse4_1(
;    const unsigned char *src_ptr,   16-byte aligned (movdqa loads)
;    int  src_stride,                must keep every row 16-byte aligned
;    const unsigned char *ref_ptr,   rows are read 24 bytes wide
;    int  ref_stride,
;    unsigned int *sad_array         eight 32-bit sums, 16-byte aligned
;);
;-----------------------------------------------------------------------
global sym(vpx_sad16x8x8_sse4_1) PRIVATE
sym(vpx_sad16x8x8_sse4_1):
        push            rbp
        mov             rbp,        rsp
        SHADOW_ARGS_TO_STACK 5
        push            rsi
        push            rdi
        ; end prolog

        mov             rsi,        arg(0)              ; src_ptr
        mov             rdi,        arg(2)              ; ref_ptr

        movsxd          rax,        dword ptr arg(1)    ; src_stride
        movsxd          rdx,        dword ptr arg(3)    ; ref_stride

        ; 4 row pairs = 8 rows
        PROCESS_16X2X8 1
        PROCESS_16X2X8 0
        PROCESS_16X2X8 0
        PROCESS_16X2X8 0

        WRITE_AS_INTS

        ; begin epilog
        pop             rdi
        pop             rsi
        UNSHADOW_ARGS
        pop             rbp
        ret


;-----------------------------------------------------------------------
;void vpx_sad8x8x8_sse4_1(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,   rows are read 16 bytes wide
;    int  ref_stride,
;    unsigned int *sad_array         eight 32-bit sums, 16-byte aligned
;);
;-----------------------------------------------------------------------
global sym(vpx_sad8x8x8_sse4_1) PRIVATE
sym(vpx_sad8x8x8_sse4_1):
        push            rbp
        mov             rbp,        rsp
        SHADOW_ARGS_TO_STACK 5
        push            rsi
        push            rdi
        ; end prolog

        mov             rsi,        arg(0)              ; src_ptr
        mov             rdi,        arg(2)              ; ref_ptr

        movsxd          rax,        dword ptr arg(1)    ; src_stride
        movsxd          rdx,        dword ptr arg(3)    ; ref_stride

        ; 4 row pairs = 8 rows
        PROCESS_8X2X8 1
        PROCESS_8X2X8 0
        PROCESS_8X2X8 0
        PROCESS_8X2X8 0

        WRITE_AS_INTS

        ; begin epilog
        pop             rdi
        pop             rsi
        UNSHADOW_ARGS
        pop             rbp
        ret


;-----------------------------------------------------------------------
;void vpx_sad8x16x8_sse4_1(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,   rows are read 16 bytes wide
;    int  ref_stride,
;    unsigned int *sad_array         eight 32-bit sums, 16-byte aligned
;);
;-----------------------------------------------------------------------
global sym(vpx_sad8x16x8_sse4_1) PRIVATE
sym(vpx_sad8x16x8_sse4_1):
        push            rbp
        mov             rbp,        rsp
        SHADOW_ARGS_TO_STACK 5
        push            rsi
        push            rdi
        ; end prolog

        mov             rsi,        arg(0)              ; src_ptr
        mov             rdi,        arg(2)              ; ref_ptr

        movsxd          rax,        dword ptr arg(1)    ; src_stride
        movsxd          rdx,        dword ptr arg(3)    ; ref_stride

        ; 8 row pairs = 16 rows
        PROCESS_8X2X8 1
        PROCESS_8X2X8 0
        PROCESS_8X2X8 0
        PROCESS_8X2X8 0
        PROCESS_8X2X8 0
        PROCESS_8X2X8 0
        PROCESS_8X2X8 0
        PROCESS_8X2X8 0

        WRITE_AS_INTS

        ; begin epilog
        pop             rdi
        pop             rsi
        UNSHADOW_ARGS
        pop             rbp
        ret


;-----------------------------------------------------------------------
;void vpx_sad4x4x8_sse4_1(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,   rows are read 16 bytes wide
;    int  ref_stride,
;    unsigned int *sad_array         eight 32-bit sums, 16-byte aligned
;);
;-----------------------------------------------------------------------
global sym(vpx_sad4x4x8_sse4_1) PRIVATE
sym(vpx_sad4x4x8_sse4_1):
        push            rbp
        mov             rbp,        rsp
        SHADOW_ARGS_TO_STACK 5
        push            rsi
        push            rdi
        ; end prolog

        mov             rsi,        arg(0)              ; src_ptr
        mov             rdi,        arg(2)              ; ref_ptr

        movsxd          rax,        dword ptr arg(1)    ; src_stride
        movsxd          rdx,        dword ptr arg(3)    ; ref_stride

        ; 2 row pairs = 4 rows
        PROCESS_4X2X8 1
        PROCESS_4X2X8 0

        WRITE_AS_INTS

        ; begin epilog
        pop             rdi
        pop             rsi
        UNSHADOW_ARGS
        pop             rbp
        ret