;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

%include "aom_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------------
; SSE2 two-tap sub-pixel filters (NASM/yasm, Intel operand order).
;
; Every routine in this file produces, for each output byte,
;
;     out = sat_u8((a * k3 + b * k4 + 64) >> 7)
;
; where k3 / k4 are the 16-bit words at index 3 and 4 of the filter table,
; `a` is the source byte at the output position and `b` is either the byte
; one source pitch further on (the *_v2 routines) or the next byte in the same
; row (the *_h2 routines).
;
; Arguments are fetched through arg(n) from the ABI support include:
;     arg(0) src_ptr          arg(1) pixels_per_line (read as a 32-bit int)
;     arg(2) output_ptr       arg(3) out_pitch       (read as a 32-bit int)
;     arg(4) output_height    arg(5) filter ptr
;
; The filter table is loaded with movdqa, so it must be 16-byte aligned.
; The row loop is `dec rcx / jnz`, so output_height must be non-zero.
;-----------------------------------------------------------------------------

; Rounding term: two packed 16-bit words, each 0x0040 (= 64 = 1 << (7 - 1)).
%define BILINEAR_ROUND_PAIR 0x00400040

;-----------------------------------------------------------------------------
; GET_PARAM_4 -- load arguments and constants for the 4-pixel-wide routines.
; Out:   rsi  = src_ptr            rdi  = output_ptr
;        rax  = pixels_per_line    rdx  = out_pitch
;        rcx  = output_height
;        xmm4 = k3 k3 k3 k3 | k4 k4 k4 k4   (low qword | high qword, words)
;        xmm3 = 64 in every word            (rounding)
;        xmm2 = 0                           (for byte -> word unpack)
;-----------------------------------------------------------------------------
%macro GET_PARAM_4 0
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, BILINEAR_ROUND_PAIR

    movdqa      xmm3, [rdx]                 ;load filters
    pshuflw     xmm4, xmm3, 11111111b       ;k3 in the four low words
    psrldq      xmm3, 8                     ;bring word 4 down to word 0
    pshuflw     xmm3, xmm3, 0b              ;k4 in the four low words
    punpcklqdq  xmm4, xmm3                  ;k3 x4 | k4 x4

    movq        xmm3, rcx                   ;rounding
    pshufd      xmm3, xmm3, 0               ;replicate to all four dwords

    pxor        xmm2, xmm2

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
%endm

;-----------------------------------------------------------------------------
; APPLY_FILTER_4 avg -- filter, store and advance one 4-pixel row.
; In:    xmm0 low dword = the four `a` bytes
;        xmm1 low dword = the four `b` bytes
;        registers as set up by GET_PARAM_4
; %1:    non-zero -> pavgb the result with the bytes already at [rdi]
; Out:   4 bytes written to [rdi]; rsi += rax; rdi += rdx; rcx -= 1.
;        ZF reflects the final `dec rcx` (the lea instructions come before
;        it), so the caller can branch on it directly.
; Clobb: xmm0, xmm1, flags
;-----------------------------------------------------------------------------
%macro APPLY_FILTER_4 1

    punpckldq   xmm0, xmm1                  ;two row in one register
    punpcklbw   xmm0, xmm2                  ;unpack to word: a x4 | b x4
    pmullw      xmm0, xmm4                  ;multiply the filter factors

    movdqa      xmm1, xmm0
    psrldq      xmm1, 8                     ;b * k4 down to the low qword
    paddsw      xmm0, xmm1                  ;a * k3 + b * k4 (low 4 words)

    paddsw      xmm0, xmm3                  ;rounding
    psraw       xmm0, 7                     ;shift
    packuswb    xmm0, xmm0                  ;pack to byte

%if %1
    movd        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif

    movd        [rdi], xmm0                 ;only the low 4 bytes are kept
    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
%endm

;-----------------------------------------------------------------------------
; GET_PARAM -- load arguments and constants for the 8/16-pixel-wide routines.
; Out:   rsi  = src_ptr            rdi  = output_ptr
;        rax  = pixels_per_line    rdx  = out_pitch
;        rcx  = output_height
;        xmm6 = k3 in every word   xmm7 = k4 in every word
;        xmm4 = 64 in every word   (rounding)
;        xmm5 = 0                  (for byte -> word unpack)
;-----------------------------------------------------------------------------
%macro GET_PARAM 0
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, BILINEAR_ROUND_PAIR

    movdqa      xmm7, [rdx]                 ;load filters

    pshuflw     xmm6, xmm7, 11111111b       ;k3 in the four low words
    pshufhw     xmm7, xmm7, 0b              ;k4 in the four high words
    punpcklwd   xmm6, xmm6                  ;k3 in all eight words
    punpckhwd   xmm7, xmm7                  ;k4 in all eight words

    movq        xmm4, rcx                   ;rounding
    pshufd      xmm4, xmm4, 0               ;replicate to all four dwords

    pxor        xmm5, xmm5

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
%endm

;-----------------------------------------------------------------------------
; APPLY_FILTER_8 avg -- filter, store and advance one 8-pixel row.
; In:    xmm0 low qword = the eight `a` bytes
;        xmm1 low qword = the eight `b` bytes
;        registers as set up by GET_PARAM
; %1:    non-zero -> pavgb the result with the bytes already at [rdi]
; Out:   8 bytes written to [rdi]; rsi += rax; rdi += rdx; rcx -= 1
;        (ZF from the final `dec rcx`).
; Clobb: xmm0, xmm1, flags
;-----------------------------------------------------------------------------
%macro APPLY_FILTER_8 1
    punpcklbw   xmm0, xmm5                  ;a bytes -> words
    punpcklbw   xmm1, xmm5                  ;b bytes -> words

    pmullw      xmm0, xmm6                  ;a * k3
    pmullw      xmm1, xmm7                  ;b * k4
    paddsw      xmm0, xmm1
    paddsw      xmm0, xmm4                  ;rounding
    psraw       xmm0, 7                     ;shift
    packuswb    xmm0, xmm0                  ;pack back to byte
%if %1
    movq        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movq        [rdi], xmm0                 ;store the result

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
%endm

;-----------------------------------------------------------------------------
; APPLY_FILTER_16 avg -- filter, store and advance one 16-pixel row.
; In:    xmm0 = xmm2 = the sixteen `a` bytes
;        xmm1 = xmm3 = the sixteen `b` bytes
;        (xmm0/xmm1 supply the low 8 pixels, xmm2/xmm3 the high 8)
;        remaining registers as set up by GET_PARAM
; %1:    non-zero -> pavgb the result with the bytes already at [rdi]
; Out:   16 bytes written to [rdi] (unaligned store); rsi += rax;
;        rdi += rdx; rcx -= 1 (ZF from the final `dec rcx`).
; Clobb: xmm0-xmm3, flags
;-----------------------------------------------------------------------------
%macro APPLY_FILTER_16 1
    punpcklbw   xmm0, xmm5                  ;a, pixels 0-7  -> words
    punpcklbw   xmm1, xmm5                  ;b, pixels 0-7  -> words
    punpckhbw   xmm2, xmm5                  ;a, pixels 8-15 -> words
    punpckhbw   xmm3, xmm5                  ;b, pixels 8-15 -> words

    pmullw      xmm0, xmm6                  ;a * k3
    pmullw      xmm1, xmm7                  ;b * k4
    pmullw      xmm2, xmm6
    pmullw      xmm3, xmm7

    paddsw      xmm0, xmm1
    paddsw      xmm2, xmm3

    paddsw      xmm0, xmm4                  ;rounding
    paddsw      xmm2, xmm4
    psraw       xmm0, 7                     ;shift
    psraw       xmm2, 7
    packuswb    xmm0, xmm2                  ;pack back to byte
%if %1
    movdqu      xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movdqu      [rdi], xmm0                 ;store the result

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
%endm

SECTION .text

;-----------------------------------------------------------------------------
; aom_filter_block1d4_v2_sse2
; 4 pixels per row; blends each source row with the row one pitch below it.
; Reads exactly 4 bytes from each of the two rows per iteration.
; Uses xmm0-xmm4 only.
;-----------------------------------------------------------------------------
global sym(aom_filter_block1d4_v2_sse2) PRIVATE
sym(aom_filter_block1d4_v2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM_4
.loop:
    movd        xmm0, [rsi]                 ;load src
    movd        xmm1, [rsi + rax]           ;row below

    APPLY_FILTER_4 0
    jnz         .loop                       ;until output_height rows done

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
; aom_filter_block1d8_v2_sse2
; 8 pixels per row; blends each source row with the row one pitch below it.
; Reads exactly 8 bytes from each of the two rows per iteration.
; Uses xmm0-xmm7, hence SAVE_XMM 7 / RESTORE_XMM around the body.
;-----------------------------------------------------------------------------
global sym(aom_filter_block1d8_v2_sse2) PRIVATE
sym(aom_filter_block1d8_v2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.loop:
    movq        xmm0, [rsi]                 ;0
    movq        xmm1, [rsi + rax]           ;1

    APPLY_FILTER_8 0
    jnz         .loop                       ;until output_height rows done

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
; aom_filter_block1d16_v2_sse2
; 16 pixels per row; blends each source row with the row one pitch below it.
; Source and destination are accessed with unaligned 16-byte moves.
;-----------------------------------------------------------------------------
global sym(aom_filter_block1d16_v2_sse2) PRIVATE
sym(aom_filter_block1d16_v2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.loop:
    movdqu      xmm0, [rsi]                 ;0
    movdqu      xmm1, [rsi + rax]           ;1
    movdqa      xmm2, xmm0                  ;copies feed the high-half unpack
    movdqa      xmm3, xmm1

    APPLY_FILTER_16 0
    jnz         .loop                       ;until output_height rows done

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
; aom_filter_block1d4_h2_sse2
; 4 pixels per row; blends each source byte with its right-hand neighbour.
; Only src[0..4] of each row contributes to the stored result, so the two
; operands are loaded as 4-byte groups at [rsi] and [rsi + 1]; nothing past
; byte 4 of the row is touched.
; Uses xmm0-xmm4 only.
;-----------------------------------------------------------------------------
global sym(aom_filter_block1d4_h2_sse2) PRIVATE
sym(aom_filter_block1d4_h2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM_4
.loop:
    movd        xmm0, [rsi]                 ;src[0..3]
    movd        xmm1, [rsi + 1]             ;src[1..4]

    APPLY_FILTER_4 0
    jnz         .loop                       ;until output_height rows done

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
; aom_filter_block1d8_h2_sse2
; 8 pixels per row; blends each source byte with its right-hand neighbour.
; Only src[0..8] of each row contributes to the result, so the two operands
; are loaded as 8-byte groups at [rsi] and [rsi + 1]; nothing past byte 8 of
; the row is touched.
;-----------------------------------------------------------------------------
global sym(aom_filter_block1d8_h2_sse2) PRIVATE
sym(aom_filter_block1d8_h2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.loop:
    movq        xmm0, [rsi]                 ;src[0..7]
    movq        xmm1, [rsi + 1]             ;src[1..8]

    APPLY_FILTER_8 0
    jnz         .loop                       ;until output_height rows done

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
; aom_filter_block1d16_h2_sse2
; 16 pixels per row; blends each source byte with its right-hand neighbour.
; Reads src[0..16] of each row via two unaligned 16-byte loads.
;-----------------------------------------------------------------------------
global sym(aom_filter_block1d16_h2_sse2) PRIVATE
sym(aom_filter_block1d16_h2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.loop:
    movdqu      xmm0, [rsi]                 ;load src
    movdqu      xmm1, [rsi + 1]             ;src shifted by one pixel
    movdqa      xmm2, xmm0                  ;copies feed the high-half unpack
    movdqa      xmm3, xmm1

    APPLY_FILTER_16 0
    jnz         .loop                       ;until output_height rows done

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret