;
;  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

SECTION .text

;void vp8_filter_by_weight16x16_sse2
;(
;    unsigned char *src,
;    int            src_stride,
;    unsigned char *dst,
;    int            dst_stride,
;    int            src_weight
;)
globalsym(vp8_filter_by_weight16x16_sse2)
sym(vp8_filter_by_weight16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movd        xmm0, arg(4)                ; src_weight
    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words

    movdqa      xmm1, [GLOBAL(tMFQE)]
    psubw       xmm1, xmm0                  ; dst_weight

    mov         rax, arg(0)                 ; src
    mov         rsi, arg(1)                 ; src_stride
    mov         rdx, arg(2)                 ; dst
    mov         rdi, arg(3)                 ; dst_stride

    mov         rcx, 16                     ; loop count
    pxor        xmm6, xmm6

.combine:
    movdqa      xmm2, [rax]
    movdqa      xmm4, [rdx]
    add         rax, rsi

    ; src * src_weight
    movdqa      xmm3, xmm2
    punpcklbw   xmm2, xmm6
    punpckhbw   xmm3, xmm6
    pmullw      xmm2, xmm0
    pmullw      xmm3, xmm0

    ; dst * dst_weight
    movdqa      xmm5, xmm4
    punpcklbw   xmm4, xmm6
    punpckhbw   xmm5, xmm6
    pmullw      xmm4, xmm1
    pmullw      xmm5, xmm1

    ; sum, round and shift
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5
    paddw       xmm2, [GLOBAL(tMFQE_round)]
    paddw       xmm3, [GLOBAL(tMFQE_round)]
    psrlw       xmm2, 4
    psrlw       xmm3, 4

    packuswb    xmm2, xmm3
    movdqa      [rdx], xmm2
    add         rdx, rdi

    dec         rcx
    jnz         .combine

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp

    ret

;void vp8_filter_by_weight8x8_sse2
;(
;    unsigned char *src,
;    int            src_stride,
;    unsigned char *dst,
;    int            dst_stride,
;    int            src_weight
;)
globalsym(vp8_filter_by_weight8x8_sse2)
sym(vp8_filter_by_weight8x8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movd        xmm0, arg(4)                ; src_weight
    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words

    movdqa      xmm1, [GLOBAL(tMFQE)]
    psubw       xmm1, xmm0                  ; dst_weight

    mov         rax, arg(0)                 ; src
    mov         rsi, arg(1)                 ; src_stride
    mov         rdx, arg(2)                 ; dst
    mov         rdi, arg(3)                 ; dst_stride

    mov         rcx, 8                      ; loop count
    pxor        xmm4, xmm4

.combine:
    movq        xmm2, [rax]
    movq        xmm3, [rdx]
    add         rax, rsi

    ; src * src_weight
    punpcklbw   xmm2, xmm4
    pmullw      xmm2, xmm0

    ; dst * dst_weight
    punpcklbw   xmm3, xmm4
    pmullw      xmm3, xmm1

    ; sum, round and shift
    paddw       xmm2, xmm3
    paddw       xmm2, [GLOBAL(tMFQE_round)]
    psrlw       xmm2, 4

    packuswb    xmm2, xmm4
    movq        [rdx], xmm2
    add         rdx, rdi

    dec         rcx
    jnz         .combine

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp

    ret
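; For reference, a scalar sketch of the blend that
; vp8_filter_by_weight16x16_sse2 and vp8_filter_by_weight8x8_sse2 above
; compute, in C-style pseudocode. The helper name is hypothetical and not
; part of this file; it assumes MFQE_PRECISION == 4, which matches the
; tMFQE (16) and tMFQE_round (8) constants in the RODATA section below:
;
;   void filter_by_weight_c(const unsigned char *src, int src_stride,
;                           unsigned char *dst, int dst_stride,
;                           int block_size, int src_weight) {
;     const int dst_weight = (1 << 4) - src_weight;
;     const int rounding_bit = 1 << 3;
;     int r, c;
;     for (r = 0; r < block_size; r++) {
;       for (c = 0; c < block_size; c++) {
;         dst[c] = (src[c] * src_weight + dst[c] * dst_weight
;                   + rounding_bit) >> 4;
;       }
;       src += src_stride;
;       dst += dst_stride;
;     }
;   }
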
;void vp8_variance_and_sad_16x16_sse2 | arg
;(
;    unsigned char *src1,          0
;    int            stride1,       1
;    unsigned char *src2,          2
;    int            stride2,       3
;    unsigned int  *variance,      4
;    unsigned int  *sad            5
;)
; Both outputs are rounded and normalized by the 256 pixels in the block:
;    *sad      = (SAD(src1, src2) + 128) >> 8
;    *variance = (SSE(src2) - SUM(src2)^2 / 256 + 128) >> 8
globalsym(vp8_variance_and_sad_16x16_sse2)
sym(vp8_variance_and_sad_16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rax, arg(0)                 ; src1
    mov         rcx, arg(1)                 ; stride1
    mov         rdx, arg(2)                 ; src2
    mov         rdi, arg(3)                 ; stride2

    mov         rsi, 16                     ; block height

    ; Prep accumulator registers
    pxor        xmm3, xmm3                  ; SAD
    pxor        xmm4, xmm4                  ; sum of src2
    pxor        xmm5, xmm5                  ; sum of src2^2

    ; Because we're working with the actual output frames
    ; we can't depend on any kind of data alignment, so use
    ; unaligned loads.
.accumulate:
    movdqu      xmm0, [rax]                 ; src1
    movdqu      xmm1, [rdx]                 ; src2
    add         rax, rcx                    ; src1 + stride1
    add         rdx, rdi                    ; src2 + stride2

    ; SAD(src1, src2)
    psadbw      xmm0, xmm1
    paddusw     xmm3, xmm0

    ; SUM(src2)
    pxor        xmm2, xmm2
    psadbw      xmm2, xmm1                  ; sum src2 by misusing SAD against 0
    paddusw     xmm4, xmm2

    ; pmaddubsw would be ideal if it took two unsigned values, but it
    ; expects one signed and one unsigned operand. So zero-extend the
    ; bytes to words and use pmaddwd instead.
    pxor        xmm2, xmm2
    movdqa      xmm0, xmm1
    punpcklbw   xmm0, xmm2
    punpckhbw   xmm1, xmm2
    pmaddwd     xmm0, xmm0
    pmaddwd     xmm1, xmm1
    paddd       xmm5, xmm0
    paddd       xmm5, xmm1

    sub         rsi, 1
    jnz         .accumulate

    ; phaddd only operates on adjacent double words, so fold the two
    ; 64-bit SAD lanes together with a shift and add instead.
    ; Finalize SAD: round, normalize by the 256 pixels and store
    movdqa      xmm0, xmm3
    psrldq      xmm0, 8
    paddusw     xmm0, xmm3
    paddd       xmm0, [GLOBAL(t128)]
    psrld       xmm0, 8

    mov         rax, arg(5)
    movd        [rax], xmm0

    ; Fold the two lanes of the src2 sum together
    movdqa      xmm0, xmm4
    psrldq      xmm0, 8
    paddusw     xmm0, xmm4
    ; Square the sum and divide by the pixel count: SUM(src2)^2 / 256.
    ; pmuludq only reads the low double words, so the high lanes are ignored.
    pmuludq     xmm0, xmm0
    psrld       xmm0, 8

    ; phaddd could be used to sum adjacent double words, but we want
    ; all four values summed. Promote the double words to quad words,
    ; accumulate, shift and sum.
    pxor        xmm2, xmm2
    movdqa      xmm1, xmm5
    punpckldq   xmm1, xmm2
    punpckhdq   xmm5, xmm2
    paddd       xmm1, xmm5
    movdqa      xmm2, xmm1
    psrldq      xmm1, 8
    paddd       xmm1, xmm2

    psubd       xmm1, xmm0                  ; SSE - SUM^2/256 == variance * 256

    ; (variance * 256 + 128) >> 8
    paddd       xmm1, [GLOBAL(t128)]
    psrld       xmm1, 8
    mov         rax, arg(4)

    movd        [rax], xmm1

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
align 16

t128:
%ifndef __NASM_VER__
    ddq 128
%elif CONFIG_BIG_ENDIAN
    dq  0, 128
%else
    dq  128, 0
%endif
align 16

tMFQE:                                      ; 1 << MFQE_PRECISION
    times 8 dw 0x10
align 16

tMFQE_round:                                ; 1 << (MFQE_PRECISION - 1)
    times 8 dw 0x08
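
; For reference, a scalar sketch of what vp8_variance_and_sad_16x16_sse2
; computes, in C-style pseudocode. The helper name is hypothetical and not
; part of this file; all divisions truncate, matching the psrld shifts above:
;
;   void variance_and_sad_16x16_c(const unsigned char *src1, int stride1,
;                                 const unsigned char *src2, int stride2,
;                                 unsigned int *variance, unsigned int *sad) {
;     unsigned int sad_acc = 0, sum = 0, sse = 0;
;     int r, c;
;     for (r = 0; r < 16; r++) {
;       for (c = 0; c < 16; c++) {
;         int diff = src1[c] - src2[c];
;         sad_acc += diff < 0 ? -diff : diff;
;         sum += src2[c];
;         sse += src2[c] * src2[c];
;       }
;       src1 += stride1;
;       src2 += stride2;
;     }
;     *sad = (sad_acc + 128) >> 8;
;     *variance = (sse - sum * sum / 256 + 128) >> 8;
;   }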