;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

; PROCESS_16X2X3 %1
;
; Accumulate the SAD of two consecutive 16-byte source rows against the
; reference rows at byte offsets 0, +1 and +2 (the three candidate
; positions).  Uses unaligned lddqu loads, so it works for any reference
; alignment (this is the aligned_by_15 fallback path).
;
; %1 != 0 : first invocation -- psadbw initializes the three totals.
; %1 == 0 : subsequent invocation -- results are added into the totals.
;
; Registers: rsi = src_ptr, rdi = ref_ptr, rax = src_stride,
;            rdx = ref_stride.  Both pointers advance by two rows.
; Totals:    xmm5 (offset 0), xmm6 (+1), xmm7 (+2); psadbw leaves one
;            16-bit partial sum in each 64-bit half of the register.
%macro PROCESS_16X2X3 1
%if %1
        movdqa          xmm0,       XMMWORD PTR [rsi]    ; first source row
        lddqu           xmm5,       XMMWORD PTR [rdi]    ; ref row, offset 0
        lddqu           xmm6,       XMMWORD PTR [rdi+1]  ; ref row, offset +1
        lddqu           xmm7,       XMMWORD PTR [rdi+2]  ; ref row, offset +2

        psadbw          xmm5,       xmm0                 ; initialize totals
        psadbw          xmm6,       xmm0
        psadbw          xmm7,       xmm0
%else
        movdqa          xmm0,       XMMWORD PTR [rsi]
        lddqu           xmm1,       XMMWORD PTR [rdi]
        lddqu           xmm2,       XMMWORD PTR [rdi+1]
        lddqu           xmm3,       XMMWORD PTR [rdi+2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1                 ; accumulate into totals
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endif
        movdqa          xmm0,       XMMWORD PTR [rsi+rax]    ; second source row
        lddqu           xmm1,       XMMWORD PTR [rdi+rdx]
        lddqu           xmm2,       XMMWORD PTR [rdi+rdx+1]
        lddqu           xmm3,       XMMWORD PTR [rdi+rdx+2]

        lea             rsi,        [rsi+rax*2]          ; advance src by 2 rows
        lea             rdi,        [rdi+rdx*2]          ; advance ref by 2 rows

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endmacro

; PROCESS_16X2X3_OFFSET %1, %2
;
; Same computation as PROCESS_16X2X3, but for a reference pointer whose
; misalignment %2 (= original ref_ptr & 15) is known.  rdi has already
; been rounded down to a 16-byte boundary, so two aligned movdqa loads
; plus SSSE3 palignr reconstruct the three unaligned reference rows --
; avoiding unaligned loads, which are slow on older microarchitectures.
;
; %1 != 0 : initialize the xmm5/xmm6/xmm7 totals; %1 == 0 : accumulate.
; %2      : palignr byte shift (original misalignment of ref_ptr).
; Register roles and pointer advancement are as in PROCESS_16X2X3.
%macro PROCESS_16X2X3_OFFSET 2
%if %1
        movdqa          xmm0,       XMMWORD PTR [rsi]     ; first source row
        movdqa          xmm4,       XMMWORD PTR [rdi]     ; aligned ref, low 16
        movdqa          xmm7,       XMMWORD PTR [rdi+16]  ; aligned ref, high 16

        movdqa          xmm5,       xmm7
        palignr         xmm5,       xmm4, %2              ; ref row at offset 0

        movdqa          xmm6,       xmm7
        palignr         xmm6,       xmm4, (%2+1)          ; ref row at offset +1

        palignr         xmm7,       xmm4, (%2+2)          ; ref row at offset +2

        psadbw          xmm5,       xmm0                  ; initialize totals
        psadbw          xmm6,       xmm0
        psadbw          xmm7,       xmm0
%else
        movdqa          xmm0,       XMMWORD PTR [rsi]
        movdqa          xmm4,       XMMWORD PTR [rdi]
        movdqa          xmm3,       XMMWORD PTR [rdi+16]

        movdqa          xmm1,       xmm3
        palignr         xmm1,       xmm4, %2

        movdqa          xmm2,       xmm3
        palignr         xmm2,       xmm4, (%2+1)

        palignr         xmm3,       xmm4, (%2+2)

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1                  ; accumulate into totals
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endif
        movdqa          xmm0,       XMMWORD PTR [rsi+rax]     ; second source row
        movdqa          xmm4,       XMMWORD PTR [rdi+rdx]
        movdqa          xmm3,       XMMWORD PTR [rdi+rdx+16]

        movdqa          xmm1,       xmm3
        palignr         xmm1,       xmm4, %2

        movdqa          xmm2,       xmm3
        palignr         xmm2,       xmm4, (%2+1)

        palignr         xmm3,       xmm4, (%2+2)

        lea             rsi,        [rsi+rax*2]           ; advance src by 2 rows
        lea             rdi,        [rdi+rdx*2]           ; advance ref by 2 rows

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endmacro

; PROCESS_16X16X3_OFFSET %1, %2
;
; Emits the "%2_aligned_by_%1" jump-table target for a 16x16 block:
; rounds rdi down to the previous 16-byte boundary, processes all
; 16 rows (8 row pairs) with the known misalignment %1, then jumps
; to the function's common store code.
%macro PROCESS_16X16X3_OFFSET 2
%2_aligned_by_%1:

        sub             rdi,        %1                    ; align ref_ptr down

        PROCESS_16X2X3_OFFSET 1, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1

        jmp             %2_store_off

%endmacro

; PROCESS_16X8X3_OFFSET %1, %2
;
; As PROCESS_16X16X3_OFFSET, but for a 16x8 block (4 row pairs).
%macro PROCESS_16X8X3_OFFSET 2
%2_aligned_by_%1:

        sub             rdi,        %1                    ; align ref_ptr down

        PROCESS_16X2X3_OFFSET 1, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1

        jmp             %2_store_off

%endmacro

SECTION .text

;void vpx_sad16x16x3_ssse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Computes three 16x16 SADs -- at ref_ptr, ref_ptr+1 and ref_ptr+2 --
; and stores them in results[0..2].  Dispatches on (ref_ptr & 15)
; through a position-independent jump table of relative offsets so that
; the aligned-load/palignr path matching the actual misalignment is
; used; misalignment 15 falls back to unaligned lddqu loads (palignr
; shift 17 would be impossible).  The call/pop pair obtains the current
; instruction pointer without RIP-relative addressing.
global sym(vpx_sad16x16x3_ssse3) PRIVATE
sym(vpx_sad16x16x3_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rcx
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        mov             rdx,        0xf                   ; rdx = ref_ptr & 15
        and             rdx,        rdi

        jmp .vpx_sad16x16x3_ssse3_skiptable
.vpx_sad16x16x3_ssse3_jumptable:
        dd .vpx_sad16x16x3_ssse3_aligned_by_0  - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_1  - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_2  - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_3  - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_4  - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_5  - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_6  - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_7  - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_8  - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_9  - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_10 - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_11 - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_12 - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_13 - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_14 - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_15 - .vpx_sad16x16x3_ssse3_do_jump
.vpx_sad16x16x3_ssse3_skiptable:

        call .vpx_sad16x16x3_ssse3_do_jump
.vpx_sad16x16x3_ssse3_do_jump:
        pop             rcx                         ; get the address of do_jump
        mov             rax,  .vpx_sad16x16x3_ssse3_jumptable - .vpx_sad16x16x3_ssse3_do_jump
        add             rax,  rcx  ; get the absolute address of vpx_sad16x16x3_ssse3_jumptable

        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
        add             rcx,        rax             ; rcx = absolute dispatch target

        movsxd          rax,  dword ptr arg(1) ;src_stride
        movsxd          rdx,  dword ptr arg(3) ;ref_stride

        jmp             rcx                         ; dispatch to aligned_by_N

        PROCESS_16X16X3_OFFSET 0,  .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 1,  .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 2,  .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 3,  .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 4,  .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 5,  .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 6,  .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 7,  .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 8,  .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 9,  .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 10, .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 11, .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 12, .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 13, .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 14, .vpx_sad16x16x3_ssse3

.vpx_sad16x16x3_ssse3_aligned_by_15:
        ; misalignment 15: palignr cannot produce the +2 row, so use
        ; the unaligned-load path for all 16 rows
        PROCESS_16X2X3 1
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0

.vpx_sad16x16x3_ssse3_store_off:
        mov             rdi,        arg(4) ;Results

        ; each total holds one partial sum per 64-bit half;
        ; fold the high half into the low and store 32 bits
        movq            xmm0,       xmm5
        psrldq          xmm5,       8

        paddw           xmm0,       xmm5
        movd            [rdi],      xmm0
;-
        movq            xmm0,       xmm6
        psrldq          xmm6,       8

        paddw           xmm0,       xmm6
        movd            [rdi+4],    xmm0
;-
        movq            xmm0,       xmm7
        psrldq          xmm7,       8

        paddw           xmm0,       xmm7
        movd            [rdi+8],    xmm0

    ; begin epilog
    pop         rcx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vpx_sad16x8x3_ssse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; 16x8 variant of vpx_sad16x16x3_ssse3: three 16x8 SADs at ref_ptr,
; ref_ptr+1 and ref_ptr+2, stored in results[0..2].  Same jump-table
; dispatch on (ref_ptr & 15); see the 16x16 version for details.
global sym(vpx_sad16x8x3_ssse3) PRIVATE
sym(vpx_sad16x8x3_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rcx
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        mov             rdx,        0xf                   ; rdx = ref_ptr & 15
        and             rdx,        rdi

        jmp .vpx_sad16x8x3_ssse3_skiptable
.vpx_sad16x8x3_ssse3_jumptable:
        dd .vpx_sad16x8x3_ssse3_aligned_by_0  - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_1  - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_2  - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_3  - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_4  - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_5  - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_6  - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_7  - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_8  - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_9  - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_10 - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_11 - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_12 - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_13 - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_14 - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_15 - .vpx_sad16x8x3_ssse3_do_jump
.vpx_sad16x8x3_ssse3_skiptable:

        call .vpx_sad16x8x3_ssse3_do_jump
.vpx_sad16x8x3_ssse3_do_jump:
        pop             rcx                         ; get the address of do_jump
        mov             rax,  .vpx_sad16x8x3_ssse3_jumptable - .vpx_sad16x8x3_ssse3_do_jump
        add             rax,  rcx  ; get the absolute address of vpx_sad16x8x3_ssse3_jumptable

        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
        add             rcx,        rax             ; rcx = absolute dispatch target

        movsxd          rax,  dword ptr arg(1) ;src_stride
        movsxd          rdx,  dword ptr arg(3) ;ref_stride

        jmp             rcx                         ; dispatch to aligned_by_N

        PROCESS_16X8X3_OFFSET 0,  .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 1,  .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 2,  .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 3,  .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 4,  .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 5,  .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 6,  .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 7,  .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 8,  .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 9,  .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 10, .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 11, .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 12, .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 13, .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 14, .vpx_sad16x8x3_ssse3

.vpx_sad16x8x3_ssse3_aligned_by_15:
        ; misalignment 15: fall back to unaligned loads (see 16x16 note)

        PROCESS_16X2X3 1
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0

.vpx_sad16x8x3_ssse3_store_off:
        mov             rdi,        arg(4) ;Results

        ; fold the high 64-bit half of each total into the low half
        movq            xmm0,       xmm5
        psrldq          xmm5,       8

        paddw           xmm0,       xmm5
        movd            [rdi],      xmm0
;-
        movq            xmm0,       xmm6
        psrldq          xmm6,       8

        paddw           xmm0,       xmm6
        movd            [rdi+4],    xmm0
;-
        movq            xmm0,       xmm7
        psrldq          xmm7,       8

        paddw           xmm0,       xmm7
        movd            [rdi+8],    xmm0

    ; begin epilog
    pop         rcx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret