;
;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; Building blocks shared by the two kernels in this file.
;
; Register roles (identical in both kernels):
;   rsi / rdi   current src / ref row pointer
;   rax / rdx   src / ref stride, already converted to bytes
;   rbx         scratch address used only for prefetching
;   rcx         rows still to process
;   xmm0        all zeros (compare operand and unpack filler)
;   xmm1-xmm3   working vectors (loads and differences)
;   xmm4        scratch in the final reduction
;   xmm5        16-bit sums of differences for the current iteration
;   xmm6        four 32-bit partial sums of squared differences
;   xmm7        four 32-bit partial sums of differences
;
; The macros expand to exactly the instruction sequences the kernels
; need, in order; they exist only to avoid repeating the same text.
;-----------------------------------------------------------------------

; HBD_VAR_STEP diff_reg, next_src_reg, next_src_addr, next_ref_addr
;   In:  %1 = eight src words, xmm2 = the matching eight ref words.
;   Does: %1 = src - ref (per word); adds it to xmm5; squares and
;         pair-sums it (pmaddwd) into xmm6.
;   The loads of the *next* src/ref vectors (into %2 and xmm2) are
;   interleaved with the arithmetic on the current ones.
%macro HBD_VAR_STEP 4
        psubw       %1,   xmm2
        movdqu      %2,   XMMWORD PTR [%3]
        paddw       xmm5, %1
        pmaddwd     %1,   %1
        movdqu      xmm2, XMMWORD PTR [%4]
        paddd       xmm6, %1
%endmacro

; HBD_VAR_LAST diff_reg
;   Same arithmetic as HBD_VAR_STEP for the final vector of an
;   iteration, with nothing further to load.
%macro HBD_VAR_LAST 1
        psubw       %1,   xmm2
        paddw       xmm5, %1
        pmaddwd     %1,   %1
        paddd       xmm6, %1
%endmacro

; HBD_VAR_WIDEN_SUM
;   Sign-extends the eight 16-bit sums in xmm5 to 32 bits and adds
;   them to xmm7.  SSE2 has no packed word sign-extension, so the high
;   halves are built by hand: a mask that is 0xFFFF exactly where the
;   word is negative is interleaved above each word.
;   Clobbers xmm1, xmm2, xmm5.
%macro HBD_VAR_WIDEN_SUM 0
        movdqa      xmm1, xmm5
        movdqa      xmm2, xmm5
        pcmpgtw     xmm1, xmm0          ; 0xFFFF where word >  0
        pcmpeqw     xmm2, xmm0          ; 0xFFFF where word == 0
        por         xmm1, xmm2          ; 0xFFFF where word >= 0
        pcmpeqw     xmm1, xmm0          ; invert: 0xFFFF where word < 0
        movdqa      xmm2, xmm5
        punpcklwd   xmm5, xmm1          ; low four words  -> dwords
        punpckhwd   xmm2, xmm1          ; high four words -> dwords
        paddd       xmm7, xmm5
        paddd       xmm7, xmm2
%endmacro

; HBD_VAR_REDUCE_STORE
;   Horizontally adds the four dwords of xmm6 and of xmm7 and stores
;   the low 32 bits of each total through arg(4) (SSE) and arg(5)
;   (Sum) respectively.  The two reductions are interleaved.
;   Clobbers rdi, rax, xmm4, xmm5, xmm6, xmm7.
%macro HBD_VAR_REDUCE_STORE 0
        movdqa      xmm4, xmm6
        punpckldq   xmm6, xmm0          ; [s0, 0, s1, 0]
        punpckhdq   xmm4, xmm0          ; [s2, 0, s3, 0]
        movdqa      xmm5, xmm7
        paddd       xmm6, xmm4          ; [s0+s2, 0, s1+s3, 0]
        punpckldq   xmm7, xmm0
        punpckhdq   xmm5, xmm0
        paddd       xmm7, xmm5

        movdqa      xmm4, xmm6
        movdqa      xmm5, xmm7
        psrldq      xmm4, 8             ; bring the upper qword down
        psrldq      xmm5, 8
        paddd       xmm6, xmm4          ; low dword = full total
        paddd       xmm7, xmm5

        mov         rdi,  arg(4)        ; [SSE]
        mov         rax,  arg(5)        ; [Sum]

        movd        DWORD PTR [rdi], xmm6
        movd        DWORD PTR [rax], xmm7
%endmacro


;-----------------------------------------------------------------------
;unsigned int vpx_highbd_calc16x16var_sse2
;(
;    unsigned char   *  src_ptr,
;    int             source_stride,
;    unsigned char   *  ref_ptr,
;    int             recon_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
;
; Walks a block of 16 rows, two rows per loop iteration, reading 32
; bytes (sixteen 16-bit words) from each row of src and ref.  Writes
; the sum of squared word differences to *SSE and the sum of word
; differences to *Sum.
;
; Strides are doubled on entry to turn them into byte offsets.
;
; NOTE(review): the prototype above says `unsigned char *`, but the
; body subtracts 16-bit words and doubles the strides, i.e. it treats
; both buffers as arrays of 16-bit samples - confirm against the C
; declaration.
; NOTE(review): nothing is deliberately placed in rax as a return
; value; the last write to rax in this file is the Sum pointer.
;
; Arguments are reached through arg(n), and register/stack handling in
; the prolog and epilog goes through the macros supplied by
; x86_abi_support.asm.
;-----------------------------------------------------------------------
global sym(vpx_highbd_calc16x16var_sse2) PRIVATE
sym(vpx_highbd_calc16x16var_sse2):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 6
        SAVE_XMM 7
        push        rbx
        push        rsi
        push        rdi
        ; end prolog

        mov         rsi, arg(0)                 ; [src_ptr]
        mov         rdi, arg(2)                 ; [ref_ptr]

        movsxd      rax, DWORD PTR arg(1)       ; [source_stride]
        movsxd      rdx, DWORD PTR arg(3)       ; [recon_stride]
        add         rax, rax                    ; source stride in bytes
        add         rdx, rdx                    ; recon stride in bytes

        ; Warm the cache with the first four rows of src ...
        prefetcht0  [rsi]
        prefetcht0  [rsi+16]
        prefetcht0  [rsi+rax]
        prefetcht0  [rsi+rax+16]
        lea         rbx, [rsi+rax*2]
        prefetcht0  [rbx]
        prefetcht0  [rbx+16]
        prefetcht0  [rbx+rax]
        prefetcht0  [rbx+rax+16]

        ; ... and of ref.
        prefetcht0  [rdi]
        prefetcht0  [rdi+16]
        prefetcht0  [rdi+rdx]
        prefetcht0  [rdi+rdx+16]
        lea         rbx, [rdi+rdx*2]
        prefetcht0  [rbx]
        prefetcht0  [rbx+16]
        prefetcht0  [rbx+rdx]
        prefetcht0  [rbx+rdx+16]

        pxor        xmm0, xmm0                  ; zero constant
        pxor        xmm7, xmm7                  ; sum-of-differences accumulator

        pxor        xmm6, xmm6                  ; sum-of-squares accumulator
        mov         rcx, 16                     ; rows remaining

.row_pair_loop:
        ; First half of row 0.
        movdqu      xmm1, XMMWORD PTR [rsi]
        movdqu      xmm2, XMMWORD PTR [rdi]

        ; Prefetch the two rows the next iteration will read.
        lea         rbx, [rsi+rax*2]
        prefetcht0  [rbx]
        prefetcht0  [rbx+16]
        prefetcht0  [rbx+rax]
        prefetcht0  [rbx+rax+16]
        lea         rbx, [rdi+rdx*2]
        prefetcht0  [rbx]
        prefetcht0  [rbx+16]
        prefetcht0  [rbx+rdx]
        prefetcht0  [rbx+rdx+16]

        ; The 16-bit sums restart every iteration, so each word lane
        ; only ever holds four differences before being widened.
        ; Presumably this keeps the lane from overflowing - TODO
        ; confirm for the supported sample range.
        pxor        xmm5, xmm5

        HBD_VAR_STEP xmm1, xmm3, rsi+16,     rdi+16         ; row 0, words 0-7
        HBD_VAR_STEP xmm3, xmm1, rsi+rax,    rdi+rdx        ; row 0, words 8-15
        HBD_VAR_STEP xmm1, xmm3, rsi+rax+16, rdi+rdx+16     ; row 1, words 0-7
        HBD_VAR_LAST xmm3                                   ; row 1, words 8-15

        HBD_VAR_WIDEN_SUM

        lea         rsi, [rsi + 2*rax]          ; advance two rows
        lea         rdi, [rdi + 2*rdx]
        sub         rcx, 2
        jnz         .row_pair_loop

        HBD_VAR_REDUCE_STORE


        ; begin epilog
        pop         rdi
        pop         rsi
        pop         rbx
        RESTORE_XMM
        UNSHADOW_ARGS
        pop         rbp
        ret


;-----------------------------------------------------------------------
;unsigned int vpx_highbd_calc8x8var_sse2
;(
;    unsigned char   *  src_ptr,
;    int             source_stride,
;    unsigned char   *  ref_ptr,
;    int             recon_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
;
; Walks a block of 8 rows, four rows per loop iteration, reading 16
; bytes (eight 16-bit words) from each row of src and ref.  Writes the
; sum of squared word differences to *SSE and the sum of word
; differences to *Sum.
;
; Strides are doubled on entry to turn them into byte offsets.
;
; NOTE(review): the prototype above says `unsigned char *`, but the
; body subtracts 16-bit words and doubles the strides, i.e. it treats
; both buffers as arrays of 16-bit samples - confirm against the C
; declaration.
; NOTE(review): nothing is deliberately placed in rax as a return
; value; the last write to rax in this file is the Sum pointer.
;-----------------------------------------------------------------------
global sym(vpx_highbd_calc8x8var_sse2) PRIVATE
sym(vpx_highbd_calc8x8var_sse2):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 6
        SAVE_XMM 7
        push        rbx
        push        rsi
        push        rdi
        ; end prolog

        mov         rsi, arg(0)                 ; [src_ptr]
        mov         rdi, arg(2)                 ; [ref_ptr]

        movsxd      rax, DWORD PTR arg(1)       ; [source_stride]
        movsxd      rdx, DWORD PTR arg(3)       ; [recon_stride]
        add         rax, rax                    ; source stride in bytes
        add         rdx, rdx                    ; recon stride in bytes

        ; Warm the cache with the first four rows of src ...
        prefetcht0  [rsi]
        prefetcht0  [rsi+rax]
        lea         rbx, [rsi+rax*2]
        prefetcht0  [rbx]
        prefetcht0  [rbx+rax]

        ; ... and of ref.
        prefetcht0  [rdi]
        prefetcht0  [rdi+rdx]
        lea         rbx, [rdi+rdx*2]
        prefetcht0  [rbx]
        prefetcht0  [rbx+rdx]

        pxor        xmm0, xmm0                  ; zero constant
        pxor        xmm7, xmm7                  ; sum-of-differences accumulator

        pxor        xmm6, xmm6                  ; sum-of-squares accumulator
        mov         rcx, 8                      ; rows remaining

.row_quad_loop:
        ; Row 0.
        movdqu      xmm1, XMMWORD PTR [rsi]
        movdqu      xmm2, XMMWORD PTR [rdi]

        ; Prefetch the four rows the next iteration will read.
        lea         rbx, [rsi+rax*4]
        prefetcht0  [rbx]
        prefetcht0  [rbx+rax]
        lea         rbx, [rbx+rax*2]
        prefetcht0  [rbx]
        prefetcht0  [rbx+rax]
        lea         rbx, [rdi+rdx*4]
        prefetcht0  [rbx]
        prefetcht0  [rbx+rdx]
        lea         rbx, [rbx+rdx*2]
        prefetcht0  [rbx]
        prefetcht0  [rbx+rdx]

        ; The 16-bit sums restart every iteration, so each word lane
        ; only ever holds four differences before being widened.
        ; Presumably this keeps the lane from overflowing - TODO
        ; confirm for the supported sample range.
        pxor        xmm5, xmm5

        HBD_VAR_STEP xmm1, xmm3, rsi+rax, rdi+rdx           ; row 0

        lea         rsi, [rsi + 2*rax]          ; step to rows 2 and 3
        lea         rdi, [rdi + 2*rdx]

        HBD_VAR_STEP xmm3, xmm1, rsi,     rdi               ; row 1
        HBD_VAR_STEP xmm1, xmm3, rsi+rax, rdi+rdx           ; row 2
        HBD_VAR_LAST xmm3                                   ; row 3

        HBD_VAR_WIDEN_SUM

        lea         rsi, [rsi + 2*rax]          ; step past rows 2 and 3
        lea         rdi, [rdi + 2*rdx]
        sub         rcx, 4
        jnz         .row_quad_loop

        HBD_VAR_REDUCE_STORE

        ; begin epilog
        pop         rdi
        pop         rsi
        pop         rbx
        RESTORE_XMM
        UNSHADOW_ARGS
        pop         rbp
        ret