;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "vpx_ports/x86_abi_support.asm"

; TABULATE_SSIM - fold eight source/reference word pairs into the running
; totals sum_s, sum_r, sum_sq_s, sum_sq_r and sum_sxr.
;   In:     xmm3 = 8 source words, xmm4 = 8 reference words
;   Accum:  xmm15 += s      (word lanes, unsigned saturating add)
;           xmm14 += r      (word lanes, unsigned saturating add)
;           xmm13 += s*s    (dword lanes; pmaddwd adds adjacent products)
;           xmm12 += r*r    (dword lanes)
;           xmm11 += s*r    (dword lanes)
;   Clobb:  xmm1, xmm2, xmm3 (xmm4 is left untouched)
%macro TABULATE_SSIM 0
        ; source statistics
        movdqa          xmm1,   xmm3
        paddusw         xmm15,  xmm3    ; sum_s
        pmaddwd         xmm1,   xmm1
        paddd           xmm13,  xmm1    ; sum_sq_s

        ; reference statistics
        movdqa          xmm2,   xmm4
        paddusw         xmm14,  xmm4    ; sum_r
        pmaddwd         xmm2,   xmm2
        paddd           xmm12,  xmm2    ; sum_sq_r

        ; cross term; this overwrites xmm3, so it has to come last
        pmaddwd         xmm3,   xmm4
        paddd           xmm11,  xmm3    ; sum_sxr
%endmacro

; SUM_ACROSS_Q - horizontal add of the four dword lanes of %1.
; The dwords are widened to qwords by interleaving with xmm0, which must
; hold zero. The total ends up in the low qword of %1 and the high qword
; is zero.
;   Clobb:  xmm2 (so %1 must not be xmm2)
%macro SUM_ACROSS_Q 1
        ; 4 dwords -> 2 qword partial sums
        movdqa          xmm2,   %1
        punpckhdq       xmm2,   xmm0
        punpckldq       %1,     xmm0
        paddq           %1,     xmm2

        ; 2 qwords -> 1 qword total
        movdqa          xmm2,   %1
        punpckhqdq      xmm2,   xmm0
        punpcklqdq      %1,     xmm0
        paddq           %1,     xmm2
%endmacro

; SUM_ACROSS_W - horizontal add of the eight word lanes of %1.
; The words are first widened to dwords by interleaving with xmm0 (which
; must hold zero), then SUM_ACROSS_Q finishes the reduction.
;   Clobb:  xmm1, xmm2 (so %1 must be neither of them)
%macro SUM_ACROSS_W 1
        movdqa          xmm1,   %1
        punpckhwd       xmm1,   xmm0
        punpcklwd       %1,     xmm0
        paddd           %1,     xmm1
        SUM_ACROSS_Q    %1
%endmacro

SECTION .text

;void vpx_ssim_parms_16x16_sse2(
;    unsigned char *s,
;    int sp,
;    unsigned char *r,
;    int rp,
;    uint32_t *sum_s,
;    uint32_t *sum_r,
;    uint32_t *sum_sq_s,
;    uint32_t *sum_sq_r,
;    uint32_t *sum_sxr);
;
; TODO: Use parm passing through structure, probably don't need the pxors
; ( calling app will initialize to 0 ) could easily fit everything in sse2
; without too much hassle, and can probably do better estimates with psadbw
; or pavgb. At this point this is just meant to be first pass for calculating
; all the parms needed for 16x16 ssim so we can play with dssim as distortion
; in mode selection code.
; Gathers the five SSIM sums over 16 rows of 16 pixels each.
;   rsi = s (source row pointer)       rcx = sp (source stride, bytes)
;   rdi = r (reference row pointer)    rax = rp (reference stride, bytes)
;   rdx = rows remaining
;   xmm0 = zero (used for unpacking)   xmm11-xmm15 = accumulators
; Each result is stored as the low 32 bits of its accumulator.
globalsym(vpx_ssim_parms_16x16_sse2)
sym(vpx_ssim_parms_16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    SAVE_XMM 15
    push        rsi
    push        rdi
    ; end prolog

    ; sp and rp are declared as int: read only the low 32 bits of their
    ; argument slots and sign-extend, because the upper half of the slot is
    ; not guaranteed to be meaningful and a negative stride must stay
    ; negative when added to a 64-bit pointer.
    ; NOTE(review): relies on arg(n) expanding to a memory operand, as the
    ; other arg() loads in this function suggest - confirm against
    ; x86_abi_support.asm.
    mov         rsi,        arg(0)          ;s
    movsxd      rcx,        dword arg(1)    ;sp
    mov         rdi,        arg(2)          ;r
    movsxd      rax,        dword arg(3)    ;rp

    pxor        xmm0,       xmm0            ;zero for byte/word unpacking
    pxor        xmm15,      xmm15           ;sum_s
    pxor        xmm14,      xmm14           ;sum_r
    pxor        xmm13,      xmm13           ;sum_sq_s
    pxor        xmm12,      xmm12           ;sum_sq_r
    pxor        xmm11,      xmm11           ;sum_sxr

    mov         rdx,        16              ;row counter
.NextRow:

    ;grab source and reference pixels (16 bytes each, unaligned)
    movdqu      xmm5,       [rsi]
    movdqu      xmm6,       [rdi]

    ;upper 8 pixels, zero-extended to words
    movdqa      xmm3,       xmm5
    movdqa      xmm4,       xmm6
    punpckhbw   xmm3,       xmm0            ; high_s
    punpckhbw   xmm4,       xmm0            ; high_r

    TABULATE_SSIM

    ;lower 8 pixels, zero-extended to words
    movdqa      xmm3,       xmm5
    movdqa      xmm4,       xmm6
    punpcklbw   xmm3,       xmm0            ; low_s
    punpcklbw   xmm4,       xmm0            ; low_r

    TABULATE_SSIM

    add         rsi,        rcx             ; next s row
    add         rdi,        rax             ; next r row

    dec         rdx                         ; counter
    jnz         .NextRow

    ; Each word lane of xmm15/xmm14 has received 2 bytes per row for 16
    ; rows, i.e. at most 32 * 255 = 8160, so paddusw never saturated.
    SUM_ACROSS_W    xmm15
    SUM_ACROSS_W    xmm14
    SUM_ACROSS_Q    xmm13
    SUM_ACROSS_Q    xmm12
    SUM_ACROSS_Q    xmm11

    ; store the low dword of every total through the output pointers
    mov         rdi,        arg(4)
    movd        [rdi],      xmm15           ;*sum_s
    mov         rdi,        arg(5)
    movd        [rdi],      xmm14           ;*sum_r
    mov         rdi,        arg(6)
    movd        [rdi],      xmm13           ;*sum_sq_s
    mov         rdi,        arg(7)
    movd        [rdi],      xmm12           ;*sum_sq_r
    mov         rdi,        arg(8)
    movd        [rdi],      xmm11           ;*sum_sxr

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vpx_ssim_parms_8x8_sse2(
;    unsigned char *s,
;    int sp,
;    unsigned char *r,
;    int rp,
;    uint32_t *sum_s,
;    uint32_t *sum_r,
;    uint32_t *sum_sq_s,
;    uint32_t *sum_sq_r,
;    uint32_t *sum_sxr);
;
; TODO: Use parm passing through structure, probably don't need the pxors
; ( calling app will initialize to 0 ) could easily fit everything in sse2
; without too much hassle, and can probably do better estimates with psadbw
; or pavgb. At this point this is just meant to be first pass for calculating
; all the parms needed for 8x8 ssim so we can play with dssim as distortion in
; mode selection code.
;
; Gathers the five SSIM sums over 8 rows of 8 pixels each.
;   rsi = s (source row pointer)       rcx = sp (source stride, bytes)
;   rdi = r (reference row pointer)    rax = rp (reference stride, bytes)
;   rdx = rows remaining
;   xmm0 = zero (used for unpacking)   xmm11-xmm15 = accumulators
; Each result is stored as the low 32 bits of its accumulator.
globalsym(vpx_ssim_parms_8x8_sse2)
sym(vpx_ssim_parms_8x8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    SAVE_XMM 15
    push        rsi
    push        rdi
    ; end prolog

    ; sp and rp are declared as int: read only the low 32 bits of their
    ; argument slots and sign-extend, because the upper half of the slot is
    ; not guaranteed to be meaningful and a negative stride must stay
    ; negative when added to a 64-bit pointer.
    ; NOTE(review): relies on arg(n) expanding to a memory operand, as the
    ; other arg() loads in this function suggest - confirm against
    ; x86_abi_support.asm.
    mov         rsi,        arg(0)          ;s
    movsxd      rcx,        dword arg(1)    ;sp
    mov         rdi,        arg(2)          ;r
    movsxd      rax,        dword arg(3)    ;rp

    pxor        xmm0,       xmm0            ;zero for byte/word unpacking
    pxor        xmm15,      xmm15           ;sum_s
    pxor        xmm14,      xmm14           ;sum_r
    pxor        xmm13,      xmm13           ;sum_sq_s
    pxor        xmm12,      xmm12           ;sum_sq_r
    pxor        xmm11,      xmm11           ;sum_sxr

    mov         rdx,        8               ;row counter
.NextRow:

    ;grab source and reference pixels (8 bytes each) and widen to words
    movq        xmm3,       [rsi]
    movq        xmm4,       [rdi]
    punpcklbw   xmm3,       xmm0            ; low_s
    punpcklbw   xmm4,       xmm0            ; low_r

    TABULATE_SSIM

    add         rsi,        rcx             ; next s row
    add         rdi,        rax             ; next r row

    dec         rdx                         ; counter
    jnz         .NextRow

    ; Each word lane of xmm15/xmm14 has received 1 byte per row for 8
    ; rows, i.e. at most 8 * 255 = 2040, so paddusw never saturated.
    SUM_ACROSS_W    xmm15
    SUM_ACROSS_W    xmm14
    SUM_ACROSS_Q    xmm13
    SUM_ACROSS_Q    xmm12
    SUM_ACROSS_Q    xmm11

    ; store the low dword of every total through the output pointers
    mov         rdi,        arg(4)
    movd        [rdi],      xmm15           ;*sum_s
    mov         rdi,        arg(5)
    movd        [rdi],      xmm14           ;*sum_r
    mov         rdi,        arg(6)
    movd        [rdi],      xmm13           ;*sum_sq_s
    mov         rdi,        arg(7)
    movd        [rdi],      xmm12           ;*sum_sq_r
    mov         rdi,        arg(8)
    movd        [rdi],      xmm11           ;*sum_sxr

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret