;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

%include "aom_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------------
; Two-tap (bilinear) filters, SSSE3, for rows of 4, 8 and 16 pixels.
;
; Every entry point below takes six arguments, fetched through the arg()
; macro supplied by the include above:
;   arg(0)  src_ptr          source pixels (bytes)
;   arg(1)  pixels_per_line  source stride, read as a signed DWORD
;   arg(2)  output_ptr       destination pixels (bytes)
;   arg(3)  out_pitch        destination stride, read as a signed DWORD
;   arg(4)  output_height    number of rows, read as a signed DWORD
;   arg(5)  filter ptr       16 bytes handled as eight 16-bit taps; loaded
;                            with movdqa, so it must be 16-byte aligned.
;                            Only taps 3 and 4 are used.
;
; Per output pixel:  out = sat_u8((a * k3 + b * k4 + 64) >> 7)
;   *_v2_*:  a = src[x],  b = src[x + pixels_per_line]
;   *_h2_*:  a = src[x],  b = src[x + 1]
;
; All loops are bottom-tested (dec rcx / jnz), so output_height is assumed
; to be at least 1.
;-----------------------------------------------------------------------------

; Two 16-bit lanes of 0x0100.  pmulhrsw by 0x0100 evaluates
; ((x * 256 >> 14) + 1) >> 1, which equals (x + 64) >> 7.
%define BILINEAR_ROUND_SHIFT 0x01000100

;-----------------------------------------------------------------------------
; GET_PARAM_4: load arguments for the 4-pixel-wide kernels.
; Out:  rsi = src_ptr, rdi = output_ptr, rax = pixels_per_line,
;       rdx = out_pitch, rcx = output_height
;       xmm3 = (k3, k4) byte pair repeated across the low 8 bytes
;       xmm2 = 0x0100 in every 16-bit lane
;-----------------------------------------------------------------------------
%macro GET_PARAM_4 0
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         ecx, BILINEAR_ROUND_SHIFT

    movdqa      xmm3, [rdx]                 ;load filters
    psrldq      xmm3, 6                     ;drop taps 0-2: word 0 is now k3
    packsswb    xmm3, xmm3                  ;16-bit taps -> signed bytes
    pshuflw     xmm3, xmm3, 0b              ;k3_k4 in each low word

    movd        xmm2, ecx                   ;rounding_shift
    pshufd      xmm2, xmm2, 0               ;broadcast to all four dwords

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
%endm

;-----------------------------------------------------------------------------
; APPLY_FILTER_4 avg: filter one row of 4 pixels and step to the next row.
; In:   xmm0 = "a" pixels, xmm1 = "b" pixels (low 4 bytes of each are used)
;       %1   = nonzero to average the result with the bytes already at [rdi]
; Out:  4 bytes stored at [rdi]; rsi += rax; rdi += rdx; rcx -= 1
;       ZF comes from the final dec rcx, for the caller's jnz
; Clobbers: xmm0, xmm1
;-----------------------------------------------------------------------------
%macro APPLY_FILTER_4 1
    punpcklbw   xmm0, xmm1                  ;interleave to a0 b0 a1 b1 ...
    pmaddubsw   xmm0, xmm3                  ;a*k3 + b*k4 per 16-bit lane

    pmulhrsw    xmm0, xmm2                  ;rounding(+64)+shift(>>7)
    packuswb    xmm0, xmm0                  ;pack to byte

%if %1
    movd        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movd        [rdi], xmm0
    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
%endm

;-----------------------------------------------------------------------------
; GET_PARAM: load arguments for the 8- and 16-pixel-wide kernels.
; Out:  rsi = src_ptr, rdi = output_ptr, rax = pixels_per_line,
;       rdx = out_pitch, rcx = output_height
;       xmm7 = (k3, k4) byte pair repeated across all 16 bytes
;       xmm6 = 0x0100 in every 16-bit lane
;-----------------------------------------------------------------------------
%macro GET_PARAM 0
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         ecx, BILINEAR_ROUND_SHIFT

    movdqa      xmm7, [rdx]                 ;load filters
    psrldq      xmm7, 6                     ;drop taps 0-2: word 0 is now k3
    packsswb    xmm7, xmm7                  ;16-bit taps -> signed bytes
    pshuflw     xmm7, xmm7, 0b              ;k3_k4 in each low word
    punpcklwd   xmm7, xmm7                  ;extend the pair to the high half

    movd        xmm6, ecx                   ;rounding_shift
    pshufd      xmm6, xmm6, 0               ;broadcast to all four dwords

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
%endm

;-----------------------------------------------------------------------------
; APPLY_FILTER_8 avg: filter one row of 8 pixels and step to the next row.
; In:   xmm0 = "a" pixels, xmm1 = "b" pixels (low 8 bytes of each are used)
;       %1   = nonzero to average the result with the bytes already at [rdi]
; Out:  8 bytes stored at [rdi]; rsi += rax; rdi += rdx; rcx -= 1
;       ZF comes from the final dec rcx, for the caller's jnz
; Clobbers: xmm0, xmm1
;-----------------------------------------------------------------------------
%macro APPLY_FILTER_8 1
    punpcklbw   xmm0, xmm1                  ;interleave to a0 b0 a1 b1 ...
    pmaddubsw   xmm0, xmm7                  ;a*k3 + b*k4 per 16-bit lane

    pmulhrsw    xmm0, xmm6                  ;rounding(+64)+shift(>>7)
    packuswb    xmm0, xmm0                  ;pack back to byte

%if %1
    movq        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movq        [rdi], xmm0                 ;store the result

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
%endm

;-----------------------------------------------------------------------------
; APPLY_FILTER_16 avg: filter one row of 16 pixels and step to the next row.
; In:   xmm0 = "a" pixels, xmm2 = copy of xmm0, xmm1 = "b" pixels
;       %1   = nonzero to average the result with the bytes already at [rdi]
; Out:  16 bytes stored at [rdi]; rsi += rax; rdi += rdx; rcx -= 1
;       ZF comes from the final dec rcx, for the caller's jnz
; Clobbers: xmm0, xmm1, xmm2
;-----------------------------------------------------------------------------
%macro APPLY_FILTER_16 1
    punpcklbw   xmm0, xmm1                  ;pixels 0-7  as a/b pairs
    punpckhbw   xmm2, xmm1                  ;pixels 8-15 as a/b pairs
    pmaddubsw   xmm0, xmm7
    pmaddubsw   xmm2, xmm7

    pmulhrsw    xmm0, xmm6                  ;rounding(+64)+shift(>>7)
    pmulhrsw    xmm2, xmm6
    packuswb    xmm0, xmm2                  ;pack back to byte

%if %1
    movdqu      xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movdqu      [rdi], xmm0                 ;store the result

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
%endm

SECTION .text

;-----------------------------------------------------------------------------
; aom_filter_block1d4_v2_ssse3
; Vertical filter, 4 pixels per row: blends each source row with the row
; pixels_per_line bytes below it.  Uses xmm0-xmm3 only.
;-----------------------------------------------------------------------------
global sym(aom_filter_block1d4_v2_ssse3) PRIVATE
sym(aom_filter_block1d4_v2_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM_4
.loop:
    movd        xmm0, [rsi]                 ;load src
    movd        xmm1, [rsi + rax]           ;row below

    APPLY_FILTER_4 0
    jnz         .loop

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
; aom_filter_block1d8_v2_ssse3
; Vertical filter, 8 pixels per row.  Uses xmm6/xmm7, bracketed by the
; SAVE_XMM / RESTORE_XMM macros from the include (presumably preserving them
; where the target ABI requires it - confirm in x86_abi_support.asm).
;-----------------------------------------------------------------------------
global sym(aom_filter_block1d8_v2_ssse3) PRIVATE
sym(aom_filter_block1d8_v2_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.loop:
    movq        xmm0, [rsi]                 ;0
    movq        xmm1, [rsi + rax]           ;1

    APPLY_FILTER_8 0
    jnz         .loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
; aom_filter_block1d16_v2_ssse3
; Vertical filter, 16 pixels per row.  Source rows are loaded unaligned.
;-----------------------------------------------------------------------------
global sym(aom_filter_block1d16_v2_ssse3) PRIVATE
sym(aom_filter_block1d16_v2_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.loop:
    movdqu      xmm0, [rsi]                 ;0
    movdqu      xmm1, [rsi + rax]           ;1
    movdqa      xmm2, xmm0                  ;copy for the high-half unpack

    APPLY_FILTER_16 0
    jnz         .loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
; aom_filter_block1d4_h2_ssse3
; Horizontal filter, 4 pixels per row: blends each pixel with its right-hand
; neighbour.  Reads exactly src[0..4] per row (two overlapping 4-byte loads)
; rather than a full 16-byte vector, so nothing past the last byte the
; filter needs is touched.
;-----------------------------------------------------------------------------
global sym(aom_filter_block1d4_h2_ssse3) PRIVATE
sym(aom_filter_block1d4_h2_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM_4
.loop:
    movd        xmm0, [rsi]                 ;src[0..3]
    movd        xmm1, [rsi + 1]             ;src[1..4]

    APPLY_FILTER_4 0
    jnz         .loop

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
; aom_filter_block1d8_h2_ssse3
; Horizontal filter, 8 pixels per row.  Reads exactly src[0..8] per row
; (two overlapping 8-byte loads) rather than a full 16-byte vector.
;-----------------------------------------------------------------------------
global sym(aom_filter_block1d8_h2_ssse3) PRIVATE
sym(aom_filter_block1d8_h2_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.loop:
    movq        xmm0, [rsi]                 ;src[0..7]
    movq        xmm1, [rsi + 1]             ;src[1..8]

    APPLY_FILTER_8 0
    jnz         .loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
; aom_filter_block1d16_h2_ssse3
; Horizontal filter, 16 pixels per row.  Reads src[0..16] per row via two
; overlapping unaligned 16-byte loads.
;-----------------------------------------------------------------------------
global sym(aom_filter_block1d16_h2_ssse3) PRIVATE
sym(aom_filter_block1d16_h2_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.loop:
    movdqu      xmm0, [rsi]                 ;load src
    movdqu      xmm1, [rsi + 1]             ;right-hand neighbours
    movdqa      xmm2, xmm0                  ;copy for the high-half unpack

    APPLY_FILTER_16 0
    jnz         .loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret