;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------------
; SSE4.1 SAD kernels that score eight reference candidates per call.
;
; Syntax: NASM/Yasm (Intel operand order). Argument access, symbol naming and
; the SHADOW_ARGS_TO_STACK / UNSHADOW_ARGS helpers come from
; x86_abi_support.asm.
;
; Every public routine accumulates, for one source block, the sum of absolute
; differences against the eight reference blocks that start at
; ref_ptr+0 .. ref_ptr+7 (consecutive horizontal byte offsets), and stores the
; eight totals as 32-bit values through argument 4.
;
; Register roles shared by all macros and routines in this file:
;   rsi  = current source row            rax = source stride (sign-extended)
;   rdi  = current reference row         rdx = reference stride (sign-extended)
;   xmm1 = eight running 16-bit sums; lane i belongs to reference offset +i
;   xmm0, xmm2, xmm3, xmm4, xmm5 = scratch
;
; MPSADBW immediates used here:
;   0x0  source bytes 0..3 against the eight 4-byte windows that start at
;        destination bytes 0..7
;   0x5  source bytes 4..7 against the eight 4-byte windows that start at
;        destination bytes 4..11
; Adding both results gives eight 8-byte-wide SADs.
;
; Sums are kept in 16-bit lanes (paddw). The largest block handled here,
; 16x16, totals at most 16*16*255 = 65280, which still fits an unsigned
; 16-bit lane.
;
; Alignment requirements that follow from the movdqa accesses:
;   * 16-wide routines: every source row address must be 16-byte aligned.
;   * all routines: the result buffer must be 16-byte aligned and 32 bytes long.
; Reference rows are read with movq and may be unaligned. The 16-wide kernels
; read 24 bytes per reference row; the 8-wide and 4-wide kernels read 16.
;-----------------------------------------------------------------------------

; REF_LOAD_24 acc, base
;   acc  <- reference bytes [base+0  .. base+15]
;   xmm3 <- reference bytes [base+8  .. base+23]
;   Clobbers xmm2.
%macro REF_LOAD_24 2
        movq            %1,         MMWORD PTR [%2]
        movq            xmm3,       MMWORD PTR [%2+8]
        movq            xmm2,       MMWORD PTR [%2+16]
        punpcklqdq      %1,         xmm3
        punpcklqdq      xmm3,       xmm2
%endmacro

; REF_LOAD_16 acc, base
;   acc <- reference bytes [base+0 .. base+15]
;   Clobbers xmm3 (its low half is left holding bytes base+8 .. base+15).
%macro REF_LOAD_16 2
        movq            %1,         MMWORD PTR [%2]
        movq            xmm3,       MMWORD PTR [%2+8]
        punpcklqdq      %1,         xmm3
%endmacro

; SAD_LANES_16 acc
;   In:  xmm0 = 16 source bytes, acc / xmm3 as left by REF_LOAD_24.
;   Out: acc  = eight 16-bit SADs of the 16-byte source row, one per offset.
;   Clobbers xmm0, xmm2, xmm3, xmm4.
%macro SAD_LANES_16 1
        movdqa          xmm2,       %1
        mpsadbw         %1,         xmm0,       0x0
        mpsadbw         xmm2,       xmm0,       0x5

        ; bring source bytes 8..15 down so the same immediates cover them
        psrldq          xmm0,       8

        movdqa          xmm4,       xmm3
        mpsadbw         xmm3,       xmm0,       0x0
        mpsadbw         xmm4,       xmm0,       0x5

        paddw           %1,         xmm2
        paddw           %1,         xmm3
        paddw           %1,         xmm4
%endmacro

; SAD_LANES_8 acc
;   In:  xmm0 = 8 source bytes (low half), acc as left by REF_LOAD_16.
;   Out: acc  = eight 16-bit SADs of the 8-byte source row, one per offset.
;   Clobbers xmm2.
%macro SAD_LANES_8 1
        movdqa          xmm2,       %1
        mpsadbw         %1,         xmm0,       0x0
        mpsadbw         xmm2,       xmm0,       0x5
        paddw           %1,         xmm2
%endmacro

; PROCESS_16X2X8 first
;   Handles two 16-pixel source rows and advances rsi/rdi by two rows.
;   first != 0: xmm1 is initialised from the first row (no prior contents used).
;   first == 0: both rows are added to the sums already in xmm1.
%macro PROCESS_16X2X8 1
        movdqa          xmm0,       XMMWORD PTR [rsi]
%if %1
        REF_LOAD_24     xmm1,       rdi
        SAD_LANES_16    xmm1
%else
        REF_LOAD_24     xmm5,       rdi
        SAD_LANES_16    xmm5

        paddw           xmm1,       xmm5
%endif
        movdqa          xmm0,       XMMWORD PTR [rsi + rax]
        REF_LOAD_24     xmm5,       rdi+rdx

        ; step both pointers past the row pair while the loads are in flight
        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        SAD_LANES_16    xmm5

        paddw           xmm1,       xmm5
%endmacro

; PROCESS_8X2X8 first
;   Same contract as PROCESS_16X2X8, for 8-pixel-wide source rows.
%macro PROCESS_8X2X8 1
        movq            xmm0,       MMWORD PTR [rsi]
%if %1
        REF_LOAD_16     xmm1,       rdi
        SAD_LANES_8     xmm1
%else
        REF_LOAD_16     xmm5,       rdi
        SAD_LANES_8     xmm5

        paddw           xmm1,       xmm5
%endif
        movq            xmm0,       MMWORD PTR [rsi + rax]
        REF_LOAD_16     xmm5,       rdi+rdx

        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        SAD_LANES_8     xmm5

        paddw           xmm1,       xmm5
%endmacro

; PROCESS_4X2X8 first
;   Same contract as PROCESS_16X2X8, for 4-pixel-wide source rows. A single
;   MPSADBW with immediate 0x0 already covers the whole 4-byte row.
%macro PROCESS_4X2X8 1
        movd            xmm0,       [rsi]
%if %1
        REF_LOAD_16     xmm1,       rdi

        mpsadbw         xmm1,       xmm0,       0x0
%else
        REF_LOAD_16     xmm5,       rdi

        mpsadbw         xmm5,       xmm0,       0x0

        paddw           xmm1,       xmm5
%endif
        movd            xmm0,       [rsi + rax]
        REF_LOAD_16     xmm5,       rdi+rdx

        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        mpsadbw         xmm5,       xmm0,       0x0

        paddw           xmm1,       xmm5
%endmacro

; WRITE_AS_INTS
;   Zero-extends the eight 16-bit sums in xmm1 to 32 bits and stores them, in
;   lane order, to the buffer passed as argument 4 (32 bytes, movdqa stores,
;   so the buffer must be 16-byte aligned).
;   Clobbers rdi, xmm0, xmm1, xmm2.
%macro WRITE_AS_INTS 0
        mov             rdi,        arg(4)              ;Results
        pxor            xmm0,       xmm0
        movdqa          xmm2,       xmm1
        punpcklwd       xmm1,       xmm0
        punpckhwd       xmm2,       xmm0

        movdqa          [rdi],      xmm1
        movdqa          [rdi + 16], xmm2
%endmacro

; SAD_X8_ENTER
;   Common prologue: sets up the rbp frame, makes the five arguments
;   addressable through arg(n), saves rsi/rdi, then loads the pointer and
;   stride registers described at the top of this file.
%macro SAD_X8_ENTER 0
        push            rbp
        mov             rbp,        rsp
        SHADOW_ARGS_TO_STACK 5
        push            rsi
        push            rdi
        ; end prolog

        mov             rsi,        arg(0)              ;src_ptr
        mov             rdi,        arg(2)              ;ref_ptr

        movsxd          rax,        dword ptr arg(1)    ;src_stride
        movsxd          rdx,        dword ptr arg(3)    ;ref_stride
%endmacro

; SAD_X8_LEAVE
;   Common epilogue: mirrors SAD_X8_ENTER and returns to the caller.
%macro SAD_X8_LEAVE 0
        ; begin epilog
        pop             rdi
        pop             rsi
        UNSHADOW_ARGS
        pop             rbp
        ret
%endmacro

SECTION .text

;void vpx_sad16x16x8_sse4_1(
;    const unsigned char *src_ptr,      16-byte aligned rows
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned int *sad_array);          8 x 32-bit results, 16-byte aligned
global sym(vpx_sad16x16x8_sse4_1) PRIVATE
sym(vpx_sad16x16x8_sse4_1):
        SAD_X8_ENTER

        ; 16 rows = one initialising row pair + seven accumulating pairs
        PROCESS_16X2X8 1
%rep 7
        PROCESS_16X2X8 0
%endrep

        WRITE_AS_INTS

        SAD_X8_LEAVE


;void vpx_sad16x8x8_sse4_1(
;    const unsigned char *src_ptr,      16-byte aligned rows
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned int *sad_array            8 x 32-bit results, 16-byte aligned
;);
global sym(vpx_sad16x8x8_sse4_1) PRIVATE
sym(vpx_sad16x8x8_sse4_1):
        SAD_X8_ENTER

        ; 8 rows = one initialising row pair + three accumulating pairs
        PROCESS_16X2X8 1
%rep 3
        PROCESS_16X2X8 0
%endrep

        WRITE_AS_INTS

        SAD_X8_LEAVE


;void vpx_sad8x8x8_sse4_1(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned int *sad_array            8 x 32-bit results, 16-byte aligned
;);
global sym(vpx_sad8x8x8_sse4_1) PRIVATE
sym(vpx_sad8x8x8_sse4_1):
        SAD_X8_ENTER

        ; 8 rows = one initialising row pair + three accumulating pairs
        PROCESS_8X2X8 1
%rep 3
        PROCESS_8X2X8 0
%endrep

        WRITE_AS_INTS

        SAD_X8_LEAVE


;void vpx_sad8x16x8_sse4_1(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned int *sad_array            8 x 32-bit results, 16-byte aligned
;);
global sym(vpx_sad8x16x8_sse4_1) PRIVATE
sym(vpx_sad8x16x8_sse4_1):
        SAD_X8_ENTER

        ; 16 rows = one initialising row pair + seven accumulating pairs
        PROCESS_8X2X8 1
%rep 7
        PROCESS_8X2X8 0
%endrep

        WRITE_AS_INTS

        SAD_X8_LEAVE


;void vpx_sad4x4x8_sse4_1(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned int *sad_array            8 x 32-bit results, 16-byte aligned
;);
global sym(vpx_sad4x4x8_sse4_1) PRIVATE
sym(vpx_sad4x4x8_sse4_1):
        SAD_X8_ENTER

        ; 4 rows = one initialising row pair + one accumulating pair
        PROCESS_4X2X8 1
        PROCESS_4X2X8 0

        WRITE_AS_INTS

        SAD_X8_LEAVE