;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------------
; HALF_PEL_VAR16_REDUCE_AND_STORE
;
; Shared tail of the three half-pel variance routines in this file.
; Horizontally reduces the two accumulators and stores the results through
; the pointers passed as arguments 5 and 6.
;
; In:     xmm0 = 0 (all bits clear; required for the sign-extension trick)
;         xmm6 = 8 x int16 partial sums of differences
;         xmm7 = 4 x int32 partial sums of squared differences
; Out:    32-bit store of the total difference sum to [arg(5)]
;         32-bit store of the total squared-difference sum to [arg(6)]
; Clobb:  xmm0, xmm1, xmm5, xmm6, xmm7, rsi, rdi
;-----------------------------------------------------------------------------
%macro HALF_PEL_VAR16_REDUCE_AND_STORE 0
    pxor        xmm1, xmm1
    pxor        xmm5, xmm5                  ; xmm5 = 0, used as interleave filler

    ; Widen the eight int16 sums to int32: place each word in the upper half
    ; of a dword (lower half is zero), then arithmetic-shift right by 16 so
    ; the sign is preserved.
    punpcklwd   xmm0, xmm6
    punpckhwd   xmm1, xmm6
    psrad       xmm0, 16
    psrad       xmm1, 16
    paddd       xmm0, xmm1                  ; xmm0 = 4 x int32 partial sums
    movdqa      xmm1, xmm0

    ; Fold the four squared-difference dwords into dwords 0 and 2 of xmm6.
    movdqa      xmm6, xmm7
    punpckldq   xmm6, xmm5
    punpckhdq   xmm7, xmm5
    paddd       xmm6, xmm7

    ; Same fold for the difference sums, into dwords 0 and 2 of xmm0.
    punpckldq   xmm0, xmm5
    punpckhdq   xmm1, xmm5
    paddd       xmm0, xmm1

    movdqa      xmm7, xmm6
    movdqa      xmm1, xmm0

    psrldq      xmm7, 8                     ; bring dword 2 down to dword 0
    psrldq      xmm1, 8

    paddd       xmm6, xmm7                  ; low dword = total squared sum
    paddd       xmm0, xmm1                  ; low dword = total sum

    mov         rsi, arg(5)                 ;[Sum]
    mov         rdi, arg(6)                 ;[SSE]

    movd        [rsi], xmm0
    movd        [rdi], xmm6
%endmacro

;-----------------------------------------------------------------------------
;void vpx_half_horiz_vert_variance16x_h_sse2(unsigned char *ref,
;                                            int ref_stride,
;                                            unsigned char *src,
;                                            int src_stride,
;                                            unsigned int height,
;                                            int *sum,
;                                            unsigned int *sumsquared)
;
; For each of `height` rows, forms a 16-pixel prediction from `ref` by
; averaging horizontally neighbouring bytes (pavgb of [p] and [p+1]) and then
; averaging that result with the same horizontal average of the next row.
; The prediction is subtracted from the matching 16 bytes of `src`; the sum
; of the differences goes to *sum, the sum of their squares to *sumsquared.
;
; Reads 17 bytes from each of height + 1 consecutive `ref` rows and 16 bytes
; from each of `height` rows of `src`.
; If height is 0 nothing is read and both outputs are written as 0.
;
; NOTE: the difference sum is accumulated in 16-bit lanes. Every row adds two
; differences, each within [-255, 255], to each lane, so the lanes cannot
; overflow for height <= 64.
;-----------------------------------------------------------------------------
global sym(vpx_half_horiz_vert_variance16x_h_sse2) PRIVATE
sym(vpx_half_horiz_vert_variance16x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    pxor        xmm6, xmm6                  ; difference-sum accumulator (8 x int16)
    pxor        xmm7, xmm7                  ; squared-difference accumulator (4 x int32)
    pxor        xmm0, xmm0                  ; constant zero for byte -> word unpacking

    mov         rsi, arg(0)                 ;ref
    mov         rdi, arg(2)                 ;src
    movsxd      rcx, dword ptr arg(4)       ;height
    movsxd      rax, dword ptr arg(1)       ;ref_stride
    movsxd      rdx, dword ptr arg(3)       ;src_stride

    ; The loop below is bottom-tested; without this guard a zero height would
    ; wrap the counter and run off the end of the buffers.
    test        rcx, rcx
    jz          .reduce

    movdqu      xmm5, XMMWORD PTR [rsi]
    movdqu      xmm3, XMMWORD PTR [rsi+1]
    pavgb       xmm5, xmm3                  ; xmm5 = horizontal average of ref row 0

    lea         rsi, [rsi + rax]            ; rsi -> ref row 1

.next_row:
    ; Invariant: xmm5 = horizontal average of the previous ref row,
    ;            rsi -> current ref row, rdi -> current src row, rcx > 0.
    movdqu      xmm1, XMMWORD PTR [rsi]
    movdqu      xmm2, XMMWORD PTR [rsi+1]
    pavgb       xmm1, xmm2                  ; xmm1 = horizontal average of this ref row

    pavgb       xmm5, xmm1                  ; xmm5 = vertical average of the two rows

    movdqa      xmm4, xmm5
    punpcklbw   xmm5, xmm0                  ; xmm5 = prediction pixels 0..7 as words
    punpckhbw   xmm4, xmm0                  ; xmm4 = prediction pixels 8..15 as words

    movq        xmm3, QWORD PTR [rdi]       ; src pixels 0..7
    punpcklbw   xmm3, xmm0
    psubw       xmm5, xmm3                  ; xmm5 = prediction - src (0..7)

    movq        xmm3, QWORD PTR [rdi+8]     ; src pixels 8..15
    punpcklbw   xmm3, xmm0
    psubw       xmm4, xmm3                  ; xmm4 = prediction - src (8..15)

    paddw       xmm6, xmm5                  ; accumulate differences
    paddw       xmm6, xmm4
    pmaddwd     xmm5, xmm5                  ; square and pairwise-add to dwords
    pmaddwd     xmm4, xmm4
    paddd       xmm7, xmm5                  ; accumulate squared differences
    paddd       xmm7, xmm4

    movdqa      xmm5, xmm1                  ; keep this row's horizontal average for the next row

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]

    sub         rcx, 1
    jnz         .next_row

.reduce:
    HALF_PEL_VAR16_REDUCE_AND_STORE

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------------
;void vpx_half_vert_variance16x_h_sse2(unsigned char *ref,
;                                      int ref_stride,
;                                      unsigned char *src,
;                                      int src_stride,
;                                      unsigned int height,
;                                      int *sum,
;                                      unsigned int *sumsquared)
;
; For each of `height` rows, forms a 16-pixel prediction by averaging (pavgb)
; a `ref` row with the row below it, subtracts the matching 16 bytes of
; `src`, and accumulates the differences and their squares. The totals are
; stored to *sum and *sumsquared.
;
; Reads 16 bytes from each of height + 1 consecutive `ref` rows and 16 bytes
; from each of `height` rows of `src`.
; If height is 0 nothing is read and both outputs are written as 0.
;
; NOTE: the difference sum is accumulated in 16-bit lanes. Every row adds two
; differences, each within [-255, 255], to each lane, so the lanes cannot
; overflow for height <= 64.
;-----------------------------------------------------------------------------
global sym(vpx_half_vert_variance16x_h_sse2) PRIVATE
sym(vpx_half_vert_variance16x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    pxor        xmm6, xmm6                  ; difference-sum accumulator (8 x int16)
    pxor        xmm7, xmm7                  ; squared-difference accumulator (4 x int32)
    pxor        xmm0, xmm0                  ; constant zero for byte -> word unpacking

    mov         rsi, arg(0)                 ;ref
    mov         rdi, arg(2)                 ;src
    movsxd      rcx, dword ptr arg(4)       ;height
    movsxd      rax, dword ptr arg(1)       ;ref_stride
    movsxd      rdx, dword ptr arg(3)       ;src_stride

    ; The loop below is bottom-tested; without this guard a zero height would
    ; wrap the counter and run off the end of the buffers.
    test        rcx, rcx
    jz          .reduce

    movdqu      xmm5, XMMWORD PTR [rsi]     ; xmm5 = ref row 0
    lea         rsi, [rsi + rax]            ; rsi -> ref row 1

.next_row:
    ; Invariant: xmm5 = previous ref row, rsi -> current ref row,
    ;            rdi -> current src row, rcx > 0.
    movdqu      xmm3, XMMWORD PTR [rsi]     ; xmm3 = this ref row

    pavgb       xmm5, xmm3                  ; xmm5 = vertical average of the two rows
    movdqa      xmm4, xmm5
    punpcklbw   xmm5, xmm0                  ; xmm5 = prediction pixels 0..7 as words
    punpckhbw   xmm4, xmm0                  ; xmm4 = prediction pixels 8..15 as words

    movq        xmm2, QWORD PTR [rdi]       ; src pixels 0..7
    punpcklbw   xmm2, xmm0
    psubw       xmm5, xmm2                  ; xmm5 = prediction - src (0..7)
    movq        xmm2, QWORD PTR [rdi+8]     ; src pixels 8..15
    punpcklbw   xmm2, xmm0
    psubw       xmm4, xmm2                  ; xmm4 = prediction - src (8..15)

    paddw       xmm6, xmm5                  ; accumulate differences
    paddw       xmm6, xmm4
    pmaddwd     xmm5, xmm5                  ; square and pairwise-add to dwords
    pmaddwd     xmm4, xmm4
    paddd       xmm7, xmm5                  ; accumulate squared differences
    paddd       xmm7, xmm4

    movdqa      xmm5, xmm3                  ; this row becomes the "previous" row

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]

    sub         rcx, 1
    jnz         .next_row

.reduce:
    HALF_PEL_VAR16_REDUCE_AND_STORE

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------------
;void vpx_half_horiz_variance16x_h_sse2(unsigned char *ref,
;                                       int ref_stride,
;                                       unsigned char *src,
;                                       int src_stride,
;                                       unsigned int height,
;                                       int *sum,
;                                       unsigned int *sumsquared)
;
; For each of `height` rows, forms a 16-pixel prediction by averaging (pavgb)
; each `ref` byte with its right-hand neighbour, subtracts the matching
; 16 bytes of `src`, and accumulates the differences and their squares. The
; totals are stored to *sum and *sumsquared.
;
; Reads 17 bytes from each of `height` rows of `ref` and 16 bytes from each
; of `height` rows of `src`.
; If height is 0 nothing is read and both outputs are written as 0.
;
; NOTE: the difference sum is accumulated in 16-bit lanes. Every row adds two
; differences, each within [-255, 255], to each lane, so the lanes cannot
; overflow for height <= 64.
;-----------------------------------------------------------------------------
global sym(vpx_half_horiz_variance16x_h_sse2) PRIVATE
sym(vpx_half_horiz_variance16x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    pxor        xmm6, xmm6                  ; difference-sum accumulator (8 x int16)
    pxor        xmm7, xmm7                  ; squared-difference accumulator (4 x int32)
    pxor        xmm0, xmm0                  ; constant zero for byte -> word unpacking

    mov         rsi, arg(0)                 ;ref
    mov         rdi, arg(2)                 ;src
    movsxd      rcx, dword ptr arg(4)       ;height
    movsxd      rax, dword ptr arg(1)       ;ref_stride
    movsxd      rdx, dword ptr arg(3)       ;src_stride

    ; The loop below is bottom-tested; without this guard a zero height would
    ; wrap the counter and run off the end of the buffers.
    test        rcx, rcx
    jz          .reduce

.next_row:
    ; Invariant: rsi -> current ref row, rdi -> current src row, rcx > 0.
    movdqu      xmm5, XMMWORD PTR [rsi]     ; xmm5 = s0,s1,s2..s15
    movdqu      xmm3, XMMWORD PTR [rsi+1]   ; xmm3 = s1,s2,s3..s16

    pavgb       xmm5, xmm3                  ; xmm5 = horizontal average of this ref row
    movdqa      xmm1, xmm5
    punpcklbw   xmm5, xmm0                  ; xmm5 = prediction pixels 0..7 as words
    punpckhbw   xmm1, xmm0                  ; xmm1 = prediction pixels 8..15 as words

    movq        xmm3, QWORD PTR [rdi]       ; xmm3 = d0,d1,d2..d7
    punpcklbw   xmm3, xmm0                  ; xmm3 = words of above
    movq        xmm2, QWORD PTR [rdi+8]     ; xmm2 = d8,d9,d10..d15
    punpcklbw   xmm2, xmm0                  ; xmm2 = words of above

    psubw       xmm5, xmm3                  ; xmm5 = prediction - src (0..7)
    psubw       xmm1, xmm2                  ; xmm1 = prediction - src (8..15)
    paddw       xmm6, xmm5                  ; accumulate differences
    paddw       xmm6, xmm1
    pmaddwd     xmm5, xmm5                  ; square and pairwise-add to dwords
    pmaddwd     xmm1, xmm1
    paddd       xmm7, xmm5                  ; accumulate squared differences
    paddd       xmm7, xmm1

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]

    sub         rcx, 1
    jnz         .next_row

.reduce:
    HALF_PEL_VAR16_REDUCE_AND_STORE

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
; NOTE(review): neither table below is referenced by the routines in this
; file as shown; confirm there are no other users before removing them.
;    short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64};
align 16
xmm_bi_rd:
    times 8 dw 64
align 16
vpx_bilinear_filters_sse2:
    dw 128, 128, 128, 128, 128, 128, 128, 128,  0,  0,  0,  0,  0,  0,  0,  0
    dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
    dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
    dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
    dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
    dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
    dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
    dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112