;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

%include "aom_ports/x86_abi_support.asm"

; Syntax: NASM/YASM, Intel operand order (op dst, src).
; Target: x86-64 only -- the code uses xmm11-xmm15, which do not exist in
;         32-bit mode.
; arg(), sym(), globalsym(), SHADOW_ARGS_TO_STACK, SAVE_XMM, RESTORE_XMM and
; UNSHADOW_ARGS are not defined in this file; they are expected to come from
; x86_abi_support.asm.

; TABULATE_SSIM - fold one batch of 8 source and 8 reference samples into
; the five running accumulators.
;   In:    xmm3 = 8 source samples,    one per 16-bit lane
;          xmm4 = 8 reference samples, one per 16-bit lane
;   Accum: xmm15 += s      (8 x 16-bit lanes, saturating add)    sum_s
;          xmm14 += r      (8 x 16-bit lanes, saturating add)    sum_r
;          xmm13 += s*s    (4 x 32-bit lanes)                    sum_sq_s
;          xmm12 += r*r    (4 x 32-bit lanes)                    sum_sq_r
;          xmm11 += s*r    (4 x 32-bit lanes)                    sum_sxr
;   Clobb: xmm1, xmm2, xmm3
; pmaddwd multiplies 16-bit lanes and adds each adjacent pair of products,
; so every 32-bit lane of the three product accumulators already holds the
; sum of two products per call.
%macro TABULATE_SSIM 0
        paddusw         xmm15, xmm3  ; sum_s
        paddusw         xmm14, xmm4  ; sum_r
        movdqa          xmm1, xmm3
        pmaddwd         xmm1, xmm1
        paddd           xmm13, xmm1 ; sum_sq_s
        movdqa          xmm2, xmm4
        pmaddwd         xmm2, xmm2
        paddd           xmm12, xmm2 ; sum_sq_r
        pmaddwd         xmm3, xmm4
        paddd           xmm11, xmm3  ; sum_sxr
%endmacro

; SUM_ACROSS_Q - horizontally add the four 32-bit lanes of register %1.
; The lanes are zero-extended to 64 bits (by interleaving with xmm0) before
; they are added, so the total is formed with 64-bit adds.
;   In:    %1   = four 32-bit values
;          xmm0 = all zero (required)
;   Out:   low 64 bits of %1 = sum of the four lanes, high 64 bits = 0
;   Clobb: xmm2
%macro SUM_ACROSS_Q 1
        movdqa          xmm2,%1
        punpckldq       %1,xmm0     ; %1   = { d0, d1 } as 64-bit lanes
        punpckhdq       xmm2,xmm0   ; xmm2 = { d2, d3 } as 64-bit lanes
        paddq           %1,xmm2     ; %1   = { d0+d2, d1+d3 }
        movdqa          xmm2,%1
        punpcklqdq      %1,xmm0     ; %1   = { d0+d2, 0 }
        punpckhqdq      xmm2,xmm0   ; xmm2 = { d1+d3, 0 }
        paddq           %1,xmm2     ; %1   = { d0+d1+d2+d3, 0 }
%endmacro

; SUM_ACROSS_W - horizontally add the eight 16-bit lanes of register %1.
; The lanes are zero-extended to 32 bits and added pairwise, then the four
; partial sums are reduced with SUM_ACROSS_Q.
;   In:    %1   = eight 16-bit values
;          xmm0 = all zero (required)
;   Out:   low 64 bits of %1 = sum of the eight lanes, high 64 bits = 0
;   Clobb: xmm1, xmm2
%macro SUM_ACROSS_W 1
        movdqa          xmm1, %1
        punpcklwd       %1,xmm0     ; %1   = { w0, w1, w2, w3 } as 32-bit lanes
        punpckhwd       xmm1,xmm0   ; xmm1 = { w4, w5, w6, w7 } as 32-bit lanes
        paddd           %1, xmm1
        SUM_ACROSS_Q    %1
%endmacro

SECTION .text

;-----------------------------------------------------------------------
;void aom_ssim_parms_16x16_sse2(
;    unsigned char *s,
;    int sp,
;    unsigned char *r,
;    int rp,
;    uint32_t *sum_s,
;    uint32_t *sum_r,
;    uint32_t *sum_sq_s,
;    uint32_t *sum_sq_r,
;    uint32_t *sum_sxr);
;
; Computes, over a 16x16 block of bytes read from s (row stride sp) and
; r (row stride rp):
;    *sum_s    = sum of s[i]
;    *sum_r    = sum of r[i]
;    *sum_sq_s = sum of s[i] * s[i]
;    *sum_sq_r = sum of r[i] * r[i]
;    *sum_sxr  = sum of s[i] * r[i]
; Each output is overwritten with the low 32 bits of its total; the previous
; contents of the output locations are not read.
;
; Register roles inside the loop:
;    rsi = current s row         rcx = sp (sign-extended)
;    rdi = current r row         rax = rp (sign-extended)
;    rdx = rows remaining        xmm0 = zero
;    xmm11..xmm15 = accumulators (see TABULATE_SSIM)
;    xmm1..xmm6   = scratch
;
; Headroom: every sample is a zero-extended byte (<= 255). A 16-bit lane of
; sum_s/sum_r receives 2 samples per row for 16 rows (<= 8160), so the
; saturating adds never saturate; a 32-bit product lane receives at most
; 32 * 2 * 255 * 255, well inside 32 bits.
;
; TODO: Use parm passing through structure, probably don't need the pxors
; ( calling app will initialize to 0 ) could easily fit everything in sse2
; without too much hassle, and can probably do better estimates with psadbw
; or pavgb At this point this is just meant to be first pass for calculating
; all the parms needed for 16x16 ssim so we can play with dssim as distortion
; in mode selection code.
;-----------------------------------------------------------------------
globalsym(aom_ssim_parms_16x16_sse2)
sym(aom_ssim_parms_16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    SAVE_XMM 15
    push        rsi
    push        rdi
    ; end prolog

    ; sp and rp are C ints: the ABI leaves the upper 32 bits of their
    ; argument slots undefined, so only the low dword is read and it is
    ; sign-extended before being added to a 64-bit pointer.
    ; NOTE(review): relies on arg(n) expanding to a bracketed memory
    ; operand -- confirm against x86_abi_support.asm.
    mov             rsi,        arg(0)          ;s
    movsxd          rcx,        dword arg(1)    ;sp
    mov             rdi,        arg(2)          ;r
    movsxd          rax,        dword arg(3)    ;rp

    pxor            xmm0, xmm0                  ;zero, used for unpacking
    pxor            xmm15,xmm15  ;sum_s
    pxor            xmm14,xmm14  ;sum_r
    pxor            xmm13,xmm13  ;sum_sq_s
    pxor            xmm12,xmm12  ;sum_sq_r
    pxor            xmm11,xmm11  ;sum_sxr

    mov             rdx, 16      ;row counter
.NextRow:

    ;grab source and reference pixels (16 bytes each, alignment not assumed)
    movdqu          xmm5, [rsi]
    movdqu          xmm6, [rdi]

    ;upper 8 bytes of the row, widened to 16-bit lanes
    movdqa          xmm3, xmm5
    movdqa          xmm4, xmm6
    punpckhbw       xmm3, xmm0 ; high_s
    punpckhbw       xmm4, xmm0 ; high_r

    TABULATE_SSIM

    ;lower 8 bytes of the row (TABULATE_SSIM clobbered xmm3, so reload)
    movdqa          xmm3, xmm5
    movdqa          xmm4, xmm6
    punpcklbw       xmm3, xmm0 ; low_s
    punpcklbw       xmm4, xmm0 ; low_r

    TABULATE_SSIM

    add             rsi, rcx   ; next s row
    add             rdi, rax   ; next r row

    dec             rdx        ; counter
    jnz             .NextRow

    ;reduce every accumulator to a single total in its low lane
    SUM_ACROSS_W    xmm15
    SUM_ACROSS_W    xmm14
    SUM_ACROSS_Q    xmm13
    SUM_ACROSS_Q    xmm12
    SUM_ACROSS_Q    xmm11

    ;store the low 32 bits of each total through the output pointers
    mov             rdi,arg(4)
    movd            [rdi], xmm15                ;*sum_s
    mov             rdi,arg(5)
    movd            [rdi], xmm14                ;*sum_r
    mov             rdi,arg(6)
    movd            [rdi], xmm13                ;*sum_sq_s
    mov             rdi,arg(7)
    movd            [rdi], xmm12                ;*sum_sq_r
    mov             rdi,arg(8)
    movd            [rdi], xmm11                ;*sum_sxr

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
;void aom_ssim_parms_8x8_sse2(
;    unsigned char *s,
;    int sp,
;    unsigned char *r,
;    int rp,
;    uint32_t *sum_s,
;    uint32_t *sum_r,
;    uint32_t *sum_sq_s,
;    uint32_t *sum_sq_r,
;    uint32_t *sum_sxr);
;
; Same computation as aom_ssim_parms_16x16_sse2, over an 8x8 block: each of
; the 8 rows contributes 8 bytes from s and 8 bytes from r. Each output is
; overwritten with the low 32 bits of its total.
;
; Register roles inside the loop:
;    rsi = current s row         rcx = sp (sign-extended)
;    rdi = current r row         rax = rp (sign-extended)
;    rdx = rows remaining        xmm0 = zero
;    xmm11..xmm15 = accumulators (see TABULATE_SSIM)
;    xmm1..xmm4   = scratch
;
; Headroom: a 16-bit lane of sum_s/sum_r receives one sample (<= 255) per
; row for 8 rows (<= 2040), so the saturating adds never saturate.
;
; TODO: Use parm passing through structure, probably don't need the pxors
; ( calling app will initialize to 0 ) could easily fit everything in sse2
; without too much hassle, and can probably do better estimates with psadbw
; or pavgb At this point this is just meant to be first pass for calculating
; all the parms needed for 8x8 ssim so we can play with dssim as distortion
; in mode selection code.
;-----------------------------------------------------------------------
globalsym(aom_ssim_parms_8x8_sse2)
sym(aom_ssim_parms_8x8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    SAVE_XMM 15
    push        rsi
    push        rdi
    ; end prolog

    ; sp and rp are C ints: read only the low dword of each slot and
    ; sign-extend it (the upper 32 bits are undefined by the ABI).
    ; NOTE(review): relies on arg(n) expanding to a bracketed memory
    ; operand -- confirm against x86_abi_support.asm.
    mov             rsi,        arg(0)          ;s
    movsxd          rcx,        dword arg(1)    ;sp
    mov             rdi,        arg(2)          ;r
    movsxd          rax,        dword arg(3)    ;rp

    pxor            xmm0, xmm0                  ;zero, used for unpacking
    pxor            xmm15,xmm15  ;sum_s
    pxor            xmm14,xmm14  ;sum_r
    pxor            xmm13,xmm13  ;sum_sq_s
    pxor            xmm12,xmm12  ;sum_sq_r
    pxor            xmm11,xmm11  ;sum_sxr

    mov             rdx, 8       ;row counter
.NextRow:

    ;grab source and reference pixels (8 bytes each), widen to 16-bit lanes
    movq            xmm3, [rsi]
    movq            xmm4, [rdi]
    punpcklbw       xmm3, xmm0 ; low_s
    punpcklbw       xmm4, xmm0 ; low_r

    TABULATE_SSIM

    add             rsi, rcx   ; next s row
    add             rdi, rax   ; next r row

    dec             rdx        ; counter
    jnz             .NextRow

    ;reduce every accumulator to a single total in its low lane
    SUM_ACROSS_W    xmm15
    SUM_ACROSS_W    xmm14
    SUM_ACROSS_Q    xmm13
    SUM_ACROSS_Q    xmm12
    SUM_ACROSS_Q    xmm11

    ;store the low 32 bits of each total through the output pointers
    mov             rdi,arg(4)
    movd            [rdi], xmm15                ;*sum_s
    mov             rdi,arg(5)
    movd            [rdi], xmm14                ;*sum_r
    mov             rdi,arg(6)
    movd            [rdi], xmm13                ;*sum_sq_s
    mov             rdi,arg(7)
    movd            [rdi], xmm12                ;*sum_sq_r
    mov             rdi,arg(8)
    movd            [rdi], xmm11                ;*sum_sxr

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret