;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;


%include "aom_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------------
; Helper macros shared by the two block-variance kernels below.
;
; Register roles common to both kernels:
;   rsi / rdi   current source / reference row
;   rax / rdx   source / reference stride in bytes
;   rbx         scratch address for prefetching
;   rcx         rows still to process
;   xmm0        all zero
;   xmm5        per-iteration sum of differences, eight signed 16-bit lanes
;   xmm6        running sum of squared differences, four 32-bit lanes
;   xmm7        running sum of differences, four 32-bit lanes
;-----------------------------------------------------------------------------

; HBD_VAR_PREFETCH_ROW_PAIR addr, stride
;   Prefetch [addr] and [addr + stride] (the same column of two adjacent rows).
%macro HBD_VAR_PREFETCH_ROW_PAIR 2
    prefetcht0  [%1]
    prefetcht0  [%1+%2]
%endmacro

; HBD_VAR_ACCUM8 src_addr, ref_addr
;   Load eight 16-bit samples from each address, form d = src - ref per lane,
;   then  xmm5 += d  (16-bit lanes)  and  xmm6 += d*d  (pmaddwd adds the
;   squares of adjacent lanes into one 32-bit lane).
;   Clobbers xmm1, xmm2.
%macro HBD_VAR_ACCUM8 2
    movdqu      xmm1, XMMWORD PTR [%1]
    movdqu      xmm2, XMMWORD PTR [%2]
    psubw       xmm1, xmm2
    paddw       xmm5, xmm1
    pmaddwd     xmm1, xmm1
    paddd       xmm6, xmm1
%endmacro

; HBD_VAR_WIDEN_ROW_SUMS
;   Sign-extend the eight 16-bit partial sums in xmm5 to 32 bits and add them
;   into xmm7.  The sign mask (0xFFFF for negative lanes, 0 otherwise) comes
;   from an arithmetic shift right by 15; interleaving it above each word
;   produces the sign-extended dword.
;   Clobbers xmm1, xmm2, xmm5.
%macro HBD_VAR_WIDEN_ROW_SUMS 0
    movdqa      xmm1, xmm5
    psraw       xmm1, 15                ; per-lane sign mask
    movdqa      xmm2, xmm5
    punpcklwd   xmm5, xmm1              ; lanes 0-3 -> signed dwords
    punpckhwd   xmm2, xmm1              ; lanes 4-7 -> signed dwords
    paddd       xmm7, xmm5
    paddd       xmm7, xmm2
%endmacro

; HBD_VAR_REDUCE_TOTALS
;   Horizontally add the four 32-bit lanes of xmm6 and of xmm7; each total
;   ends up in the low dword of its register.  Requires xmm0 == 0.
;   Clobbers xmm4, xmm5.
%macro HBD_VAR_REDUCE_TOTALS 0
    movdqa      xmm4, xmm6
    punpckldq   xmm6, xmm0              ; [s0, 0, s1, 0]
    punpckhdq   xmm4, xmm0              ; [s2, 0, s3, 0]
    paddd       xmm6, xmm4              ; [s0+s2, 0, s1+s3, 0]

    movdqa      xmm5, xmm7
    punpckldq   xmm7, xmm0
    punpckhdq   xmm5, xmm0
    paddd       xmm7, xmm5

    movdqa      xmm4, xmm6
    movdqa      xmm5, xmm7
    psrldq      xmm4, 8                 ; bring the upper pair down
    psrldq      xmm5, 8
    paddd       xmm6, xmm4              ; low dword = s0+s1+s2+s3
    paddd       xmm7, xmm5
%endmacro

SECTION .text

;unsigned int aom_highbd_calc16x16var_sse2
;(
;    unsigned char   *  src_ptr,
;    int             source_stride,
;    unsigned char   *  ref_ptr,
;    int             recon_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
;
; Walks a 16x16 block (16 rows, two 16-byte loads per row) and stores
;   *SSE = sum of (src - ref)^2
;   *Sum = sum of (src - ref)
; Samples are processed as 16-bit words and both strides are doubled to get
; byte offsets, i.e. the strides are given in samples, not bytes.
; NOTE(review): the prototype above declares an unsigned int return, but no
; return value is set here (rax holds the Sum pointer at ret) - confirm that
; callers ignore it.
; NOTE(review): four differences are summed per 16-bit lane before widening;
; this assumes they cannot overflow a signed word (presumably samples are at
; most 12 bits) - confirm against callers.
globalsym(aom_highbd_calc16x16var_sse2)
sym(aom_highbd_calc16x16var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)             ;[src_ptr]
    mov         rdi, arg(2)             ;[ref_ptr]

    movsxd      rax, DWORD PTR arg(1)   ;[source_stride]
    movsxd      rdx, DWORD PTR arg(3)   ;[recon_stride]
    add         rax, rax                ; source stride in bytes
    add         rdx, rdx                ; recon stride in bytes

    ; Prefetch rows 0-3 of the source block (both 16-byte halves of each row)
    HBD_VAR_PREFETCH_ROW_PAIR rsi, rax
    HBD_VAR_PREFETCH_ROW_PAIR rsi+16, rax
    lea         rbx, [rsi+rax*2]
    HBD_VAR_PREFETCH_ROW_PAIR rbx, rax
    HBD_VAR_PREFETCH_ROW_PAIR rbx+16, rax

    ; ... and of the reference block
    HBD_VAR_PREFETCH_ROW_PAIR rdi, rdx
    HBD_VAR_PREFETCH_ROW_PAIR rdi+16, rdx
    lea         rbx, [rdi+rdx*2]
    HBD_VAR_PREFETCH_ROW_PAIR rbx, rdx
    HBD_VAR_PREFETCH_ROW_PAIR rbx+16, rdx

    pxor        xmm0, xmm0              ; zero, used by the final reduction
    pxor        xmm7, xmm7              ; sum of differences
    pxor        xmm6, xmm6              ; sum of squared differences
    mov         rcx, 16                 ; rows remaining

.var16loop:                             ; two rows per iteration
    ; Prefetch the two rows the next iteration will read
    lea         rbx, [rsi+rax*2]
    HBD_VAR_PREFETCH_ROW_PAIR rbx, rax
    HBD_VAR_PREFETCH_ROW_PAIR rbx+16, rax
    lea         rbx, [rdi+rdx*2]
    HBD_VAR_PREFETCH_ROW_PAIR rbx, rdx
    HBD_VAR_PREFETCH_ROW_PAIR rbx+16, rdx

    pxor        xmm5, xmm5              ; fresh 16-bit partial sums

    HBD_VAR_ACCUM8 rsi,        rdi          ; row 0, samples 0-7
    HBD_VAR_ACCUM8 rsi+16,     rdi+16       ; row 0, samples 8-15
    HBD_VAR_ACCUM8 rsi+rax,    rdi+rdx      ; row 1, samples 0-7
    HBD_VAR_ACCUM8 rsi+rax+16, rdi+rdx+16   ; row 1, samples 8-15

    HBD_VAR_WIDEN_ROW_SUMS

    lea         rsi, [rsi + 2*rax]
    lea         rdi, [rdi + 2*rdx]
    sub         rcx, 2
    jnz         .var16loop

    HBD_VAR_REDUCE_TOTALS

    mov         rdi, arg(4)             ; [SSE]
    mov         rax, arg(5)             ; [Sum]

    movd        DWORD PTR [rdi], xmm6
    movd        DWORD PTR [rax], xmm7

    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;unsigned int aom_highbd_calc8x8var_sse2
;(
;    unsigned char   *  src_ptr,
;    int             source_stride,
;    unsigned char   *  ref_ptr,
;    int             recon_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
;
; Walks an 8x8 block (8 rows, one 16-byte load per row) and stores
;   *SSE = sum of (src - ref)^2
;   *Sum = sum of (src - ref)
; Samples are processed as 16-bit words and both strides are doubled to get
; byte offsets, i.e. the strides are given in samples, not bytes.
; NOTE(review): the prototype above declares an unsigned int return, but no
; return value is set here (rax holds the Sum pointer at ret) - confirm that
; callers ignore it.
; NOTE(review): four differences are summed per 16-bit lane before widening;
; this assumes they cannot overflow a signed word (presumably samples are at
; most 12 bits) - confirm against callers.
globalsym(aom_highbd_calc8x8var_sse2)
sym(aom_highbd_calc8x8var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)             ;[src_ptr]
    mov         rdi, arg(2)             ;[ref_ptr]

    movsxd      rax, DWORD PTR arg(1)   ;[source_stride]
    movsxd      rdx, DWORD PTR arg(3)   ;[recon_stride]
    add         rax, rax                ; source stride in bytes
    add         rdx, rdx                ; recon stride in bytes

    ; Prefetch rows 0-3 of the source block
    HBD_VAR_PREFETCH_ROW_PAIR rsi, rax
    lea         rbx, [rsi+rax*2]
    HBD_VAR_PREFETCH_ROW_PAIR rbx, rax

    ; ... and of the reference block
    HBD_VAR_PREFETCH_ROW_PAIR rdi, rdx
    lea         rbx, [rdi+rdx*2]
    HBD_VAR_PREFETCH_ROW_PAIR rbx, rdx

    pxor        xmm0, xmm0              ; zero, used by the final reduction
    pxor        xmm7, xmm7              ; sum of differences
    pxor        xmm6, xmm6              ; sum of squared differences
    mov         rcx, 8                  ; rows remaining

.var8loop:                              ; four rows per iteration
    ; Prefetch the four rows the next iteration will read
    lea         rbx, [rsi+rax*4]
    HBD_VAR_PREFETCH_ROW_PAIR rbx, rax
    lea         rbx, [rbx+rax*2]
    HBD_VAR_PREFETCH_ROW_PAIR rbx, rax
    lea         rbx, [rdi+rdx*4]
    HBD_VAR_PREFETCH_ROW_PAIR rbx, rdx
    lea         rbx, [rbx+rdx*2]
    HBD_VAR_PREFETCH_ROW_PAIR rbx, rdx

    pxor        xmm5, xmm5              ; fresh 16-bit partial sums

    HBD_VAR_ACCUM8 rsi,     rdi         ; row 0
    HBD_VAR_ACCUM8 rsi+rax, rdi+rdx     ; row 1

    lea         rsi, [rsi + 2*rax]
    lea         rdi, [rdi + 2*rdx]

    HBD_VAR_ACCUM8 rsi,     rdi         ; row 2
    HBD_VAR_ACCUM8 rsi+rax, rdi+rdx     ; row 3

    HBD_VAR_WIDEN_ROW_SUMS

    lea         rsi, [rsi + 2*rax]
    lea         rdi, [rdi + 2*rdx]
    sub         rcx, 4
    jnz         .var8loop

    HBD_VAR_REDUCE_TOTALS

    mov         rdi, arg(4)             ; [SSE]
    mov         rax, arg(5)             ; [Sum]

    movd        DWORD PTR [rdi], xmm6
    movd        DWORD PTR [rax], xmm7

    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret