;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved.
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

%include "aom_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------------
; High bit-depth two-tap block filters (SSE2).
;
; All six exported routines read the same seven arguments through arg():
;   arg(0)  src_ptr           source pixels, 16-bit words
;   arg(1)  pixels_per_line   source stride, counted in 16-bit pixels
;   arg(2)  output_ptr        destination pixels, 16-bit words
;   arg(3)  out_pitch         destination stride, counted in 16-bit pixels
;   arg(4)  output_height     number of rows to write
;   arg(5)  filter ptr        eight 16-bit taps, fetched with movdqa; only
;                             taps 3 and 4 take part in the computation
;   arg(6)  bps               bit depth used to build the upper clamp
; arg(1), arg(3), arg(4) and arg(6) are read as 32-bit values and
; sign-extended.
;
; For every output pixel:
;   out = clamp((a * k3 + b * k4 + HBD_BIL_ROUND) >> HBD_BIL_SHIFT,
;               0, (1 << bps) - 1)
; In the _v2_ routines b sits one source row below a; in the _h2_ routines b
; is the pixel immediately to the right of a.
;
; The row loop tests its counter only after the decrement, so the loop body
; always executes at least once.
;-----------------------------------------------------------------------------

%define HBD_BIL_ROUND   0x00000040          ; 64, i.e. half of 1 << 7
%define HBD_BIL_SHIFT   7                   ; right shift after the multiply-add
%define HBD_WORD_ONES   0x00010001          ; a dword holding two words of 1
%define HBD_TMP_BYTES   (16 * 2)            ; two 16-byte stack slots

; Stack slots holding the clamp bounds in the 8- and 16-wide routines. They
; are only meaningful between HBD_BIL_ENTER_TMP and HBD_BIL_LEAVE_TMP.
%define clamp_hi        [rsp + 16 * 0]
%define clamp_lo        [rsp + 16 * 1]

;-----------------------------------------------------------------------------
; Frame entry for the 4-wide routines: frame pointer, argument shadowing and
; the rsi/rdi saves. No SAVE_XMM and no stack scratch.
;-----------------------------------------------------------------------------
%macro HBD_BIL_ENTER 0
    push      rbp
    mov       rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    push      rsi
    push      rdi
%endmacro

;-----------------------------------------------------------------------------
; Frame exit matching HBD_BIL_ENTER; returns to the caller.
;-----------------------------------------------------------------------------
%macro HBD_BIL_LEAVE 0
    pop       rdi
    pop       rsi
    UNSHADOW_ARGS
    pop       rbp
    ret
%endmacro

;-----------------------------------------------------------------------------
; Frame entry for the 8- and 16-wide routines.
; Parameter 1 is forwarded unchanged to SAVE_XMM.
; After the usual frame setup the stack is aligned to 16 bytes (rax is the
; scratch register handed to ALIGN_STACK) and HBD_TMP_BYTES are reserved for
; clamp_hi / clamp_lo.
;-----------------------------------------------------------------------------
%macro HBD_BIL_ENTER_TMP 1
    push      rbp
    mov       rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM  %1
    push      rsi
    push      rdi
    ALIGN_STACK 16, rax
    sub       rsp, HBD_TMP_BYTES
%endmacro

;-----------------------------------------------------------------------------
; Frame exit matching HBD_BIL_ENTER_TMP; returns to the caller.
; NOTE(review): the pop into rsp presumably reloads the pre-alignment stack
; pointer pushed by ALIGN_STACK - confirm against x86_abi_support.asm.
;-----------------------------------------------------------------------------
%macro HBD_BIL_LEAVE_TMP 0
    add       rsp, HBD_TMP_BYTES
    pop       rsp
    pop       rdi
    pop       rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop       rbp
    ret
%endmacro

;-----------------------------------------------------------------------------
; Argument and constant setup for the 4-wide routines.
; On exit:
;   rsi  = src_ptr            rdi  = output_ptr
;   rax  = pixels_per_line    rdx  = out_pitch     rcx = output_height
;   xmm4 = k3,k4 word pairs   xmm3 = rounding term in each dword
;   xmm5 = (1 << bps) - 1 in each word   xmm2 = zero
; xmm1 is used as scratch.
;-----------------------------------------------------------------------------
%macro HIGH_GET_PARAM_4 0
    mov       rdx, arg(5)                   ; rdx -> filter taps
    mov       rsi, arg(0)                   ; rsi -> source pixels
    mov       rdi, arg(2)                   ; rdi -> destination pixels
    mov       rcx, HBD_BIL_ROUND

    movdqa    xmm3, [rdx]                   ; fetch all eight taps
    pshuflw   xmm4, xmm3, 11111111b         ; low four words = tap 3
    psrldq    xmm3, 8                       ; taps 4..7 move to the low half
    pshuflw   xmm3, xmm3, 0b                ; low four words = tap 4
    punpcklwd xmm4, xmm3                    ; k3,k4 pair repeated four times

    movq      xmm3, rcx
    pshufd    xmm3, xmm3, 0                 ; rounding term in every dword

    mov       rdx, HBD_WORD_ONES
    movsxd    rcx, DWORD PTR arg(6)         ; rcx = bps
    movq      xmm5, rdx
    movq      xmm2, rcx                     ; shift count for psllw
    pshufd    xmm5, xmm5, 0b                ; 1 in every word
    movdqa    xmm1, xmm5
    psllw     xmm5, xmm2                    ; 1 << bps in every word
    psubw     xmm5, xmm1                    ; upper clamp: (1 << bps) - 1
    pxor      xmm2, xmm2                    ; lower clamp: 0

    movsxd    rax, DWORD PTR arg(1)         ; rax = pixels_per_line
    movsxd    rdx, DWORD PTR arg(3)         ; rdx = out_pitch
    movsxd    rcx, DWORD PTR arg(4)         ; rcx = output_height
%endmacro

;-----------------------------------------------------------------------------
; Filter, clamp and store one row of four pixels, then step to the next row.
; Parameter 1: when nonzero the result is averaged (pavgw) with the four
;              pixels already at the destination before being stored.
; In:  xmm0 = a pixels, xmm1 = b pixels (low four words of each)
;      registers as left by HIGH_GET_PARAM_4
; Out: rsi and rdi advanced by one row, rcx decremented. The flags come from
;      the final dec, so the caller can branch on them directly.
;-----------------------------------------------------------------------------
%macro HIGH_APPLY_FILTER_4 1

    punpcklwd xmm0, xmm1                    ; interleave to a,b word pairs
    pmaddwd   xmm0, xmm4                    ; a * k3 + b * k4 per dword

    paddd     xmm0, xmm3                    ; add the rounding term
    psrad     xmm0, HBD_BIL_SHIFT           ; scale back down
    packssdw  xmm0, xmm0                    ; dwords -> saturated words

    ; keep the result inside [0, (1 << bps) - 1]
    pminsw    xmm0, xmm5
    pmaxsw    xmm0, xmm2

%if %1
    movq      xmm1, [rdi]
    pavgw     xmm0, xmm1
%endif

    movq      [rdi], xmm0                   ; write four pixels
    lea       rsi, [rsi + 2*rax]            ; next source row
    lea       rdi, [rdi + 2*rdx]            ; next destination row
    dec       rcx
%endmacro

;-----------------------------------------------------------------------------
; Argument and constant setup for the 8- and 16-wide routines.
; Requires the clamp_hi / clamp_lo stack slots to exist already.
; On exit:
;   rsi  = src_ptr            rdi  = output_ptr
;   rax  = pixels_per_line    rdx  = out_pitch     rcx = output_height
;   xmm7 = k3,k4 word pairs   xmm4 = rounding term in each dword
;   clamp_hi = (1 << bps) - 1 in each word   clamp_lo = zero
; xmm1, xmm3, xmm5 and xmm6 are used as scratch; the clamp bounds are
; written to the stack slots rather than kept in registers.
;-----------------------------------------------------------------------------
%macro HIGH_GET_PARAM 0
    mov       rdx, arg(5)                   ; rdx -> filter taps
    mov       rsi, arg(0)                   ; rsi -> source pixels
    mov       rdi, arg(2)                   ; rdi -> destination pixels
    mov       rcx, HBD_BIL_ROUND

    movdqa    xmm6, [rdx]                   ; fetch all eight taps

    pshuflw   xmm7, xmm6, 11111111b         ; low four words = tap 3
    pshufhw   xmm6, xmm6, 0b                ; high four words = tap 4
    psrldq    xmm6, 8                       ; tap 4 copies move to the low half
    punpcklwd xmm7, xmm6                    ; k3,k4 pair repeated four times

    movq      xmm4, rcx
    pshufd    xmm4, xmm4, 0                 ; rounding term in every dword

    mov       rdx, HBD_WORD_ONES
    movsxd    rcx, DWORD PTR arg(6)         ; rcx = bps
    movq      xmm3, rdx
    movq      xmm5, rcx                     ; shift count for psllw
    pshufd    xmm3, xmm3, 0b                ; 1 in every word
    movdqa    xmm1, xmm3
    psllw     xmm3, xmm5                    ; 1 << bps in every word
    psubw     xmm3, xmm1                    ; upper clamp: (1 << bps) - 1
    pxor      xmm5, xmm5                    ; lower clamp: 0

    movdqa    clamp_hi, xmm3
    movdqa    clamp_lo, xmm5

    movsxd    rax, DWORD PTR arg(1)         ; rax = pixels_per_line
    movsxd    rdx, DWORD PTR arg(3)         ; rdx = out_pitch
    movsxd    rcx, DWORD PTR arg(4)         ; rcx = output_height
%endmacro

;-----------------------------------------------------------------------------
; Filter, clamp and store one row of eight pixels, then step to the next row.
; Parameter 1: when nonzero the result is averaged (pavgw) with the eight
;              pixels already at the destination before being stored.
; In:  xmm0 = a pixels, xmm1 = b pixels (eight words each)
;      registers and stack slots as left by HIGH_GET_PARAM
; Out: rsi and rdi advanced by one row, rcx decremented. The flags come from
;      the final dec. xmm6 is used as scratch.
;-----------------------------------------------------------------------------
%macro HIGH_APPLY_FILTER_8 1
    movdqa    xmm6, xmm0
    punpckhwd xmm6, xmm1                    ; a,b pairs for pixels 4..7
    punpcklwd xmm0, xmm1                    ; a,b pairs for pixels 0..3
    pmaddwd   xmm6, xmm7
    pmaddwd   xmm0, xmm7

    paddd     xmm6, xmm4                    ; add the rounding term
    paddd     xmm0, xmm4
    psrad     xmm6, HBD_BIL_SHIFT           ; scale back down
    psrad     xmm0, HBD_BIL_SHIFT
    packssdw  xmm0, xmm6                    ; eight saturated words, in order

    ; keep the result inside [0, (1 << bps) - 1]
    pminsw    xmm0, clamp_hi
    pmaxsw    xmm0, clamp_lo

%if %1
    movdqu    xmm1, [rdi]
    pavgw     xmm0, xmm1
%endif
    movdqu    [rdi], xmm0                   ; write eight pixels

    lea       rsi, [rsi + 2*rax]            ; next source row
    lea       rdi, [rdi + 2*rdx]            ; next destination row
    dec       rcx
%endmacro

;-----------------------------------------------------------------------------
; Filter, clamp and store one row of sixteen pixels, then step to the next
; row.
; Parameter 1: when nonzero the result is averaged (pavgw) with the sixteen
;              pixels already at the destination before being stored.
; In:  xmm0 / xmm2 = a pixels 0..7 / 8..15
;      xmm1 / xmm3 = b pixels 0..7 / 8..15
;      registers and stack slots as left by HIGH_GET_PARAM
; Out: rsi and rdi advanced by one row, rcx decremented. The flags come from
;      the final dec. xmm5 and xmm6 are used as scratch.
;-----------------------------------------------------------------------------
%macro HIGH_APPLY_FILTER_16 1
    movdqa    xmm5, xmm0
    movdqa    xmm6, xmm2
    punpckhwd xmm5, xmm1                    ; a,b pairs for pixels 4..7
    punpckhwd xmm6, xmm3                    ; a,b pairs for pixels 12..15
    punpcklwd xmm0, xmm1                    ; a,b pairs for pixels 0..3
    punpcklwd xmm2, xmm3                    ; a,b pairs for pixels 8..11

    pmaddwd   xmm5, xmm7
    pmaddwd   xmm6, xmm7
    pmaddwd   xmm0, xmm7
    pmaddwd   xmm2, xmm7

    paddd     xmm5, xmm4                    ; add the rounding term
    paddd     xmm6, xmm4
    paddd     xmm0, xmm4
    paddd     xmm2, xmm4

    psrad     xmm5, HBD_BIL_SHIFT           ; scale back down
    psrad     xmm6, HBD_BIL_SHIFT
    psrad     xmm0, HBD_BIL_SHIFT
    psrad     xmm2, HBD_BIL_SHIFT

    packssdw  xmm0, xmm5                    ; pixels 0..7 as saturated words
    packssdw  xmm2, xmm6                    ; pixels 8..15 as saturated words

    ; keep the result inside [0, (1 << bps) - 1]
    pminsw    xmm0, clamp_hi
    pmaxsw    xmm0, clamp_lo
    pminsw    xmm2, clamp_hi
    pmaxsw    xmm2, clamp_lo

%if %1
    movdqu    xmm1, [rdi]
    movdqu    xmm3, [rdi + 16]
    pavgw     xmm0, xmm1
    pavgw     xmm2, xmm3
%endif
    movdqu    [rdi], xmm0                   ; write pixels 0..7
    movdqu    [rdi + 16], xmm2              ; write pixels 8..15

    lea       rsi, [rsi + 2*rax]            ; next source row
    lea       rdi, [rdi + 2*rdx]            ; next destination row
    dec       rcx
%endmacro

SECTION .text

;-----------------------------------------------------------------------------
; Vertical two-tap filter, four pixels per row.
;-----------------------------------------------------------------------------
globalsym(aom_highbd_filter_block1d4_v2_sse2)
sym(aom_highbd_filter_block1d4_v2_sse2):
    HBD_BIL_ENTER

    HIGH_GET_PARAM_4
.loop:
    movq      xmm0, [rsi]                   ; a: four pixels of the current row
    movq      xmm1, [rsi + 2*rax]           ; b: same columns, one row down

    HIGH_APPLY_FILTER_4 0
    jnz       .loop                         ; rows remaining (flags from dec rcx)

    HBD_BIL_LEAVE

;-----------------------------------------------------------------------------
; Vertical two-tap filter, eight pixels per row.
;-----------------------------------------------------------------------------
globalsym(aom_highbd_filter_block1d8_v2_sse2)
sym(aom_highbd_filter_block1d8_v2_sse2):
    HBD_BIL_ENTER_TMP 8

    HIGH_GET_PARAM
.loop:
    movdqu    xmm0, [rsi]                   ; a: eight pixels of the current row
    movdqu    xmm1, [rsi + 2*rax]           ; b: same columns, one row down

    HIGH_APPLY_FILTER_8 0
    jnz       .loop                         ; rows remaining (flags from dec rcx)

    HBD_BIL_LEAVE_TMP

;-----------------------------------------------------------------------------
; Vertical two-tap filter, sixteen pixels per row.
;-----------------------------------------------------------------------------
globalsym(aom_highbd_filter_block1d16_v2_sse2)
sym(aom_highbd_filter_block1d16_v2_sse2):
    HBD_BIL_ENTER_TMP 9

    HIGH_GET_PARAM
.loop:
    movdqu    xmm0, [rsi]                   ; a: pixels 0..7 of the current row
    movdqu    xmm2, [rsi + 16]              ; a: pixels 8..15
    movdqu    xmm1, [rsi + 2*rax]           ; b: pixels 0..7, one row down
    movdqu    xmm3, [rsi + 2*rax + 16]      ; b: pixels 8..15, one row down

    HIGH_APPLY_FILTER_16 0
    jnz       .loop                         ; rows remaining (flags from dec rcx)

    HBD_BIL_LEAVE_TMP

;-----------------------------------------------------------------------------
; Horizontal two-tap filter, four pixels per row.
; Each iteration loads a full 16 bytes from the source row even though only
; the low words end up in the result.
;-----------------------------------------------------------------------------
globalsym(aom_highbd_filter_block1d4_h2_sse2)
sym(aom_highbd_filter_block1d4_h2_sse2):
    HBD_BIL_ENTER

    HIGH_GET_PARAM_4
.loop:
    movdqu    xmm0, [rsi]                   ; a: pixels starting at x
    movdqa    xmm1, xmm0
    psrldq    xmm1, 2                       ; b: the same data shifted to x + 1

    HIGH_APPLY_FILTER_4 0
    jnz       .loop                         ; rows remaining (flags from dec rcx)

    HBD_BIL_LEAVE

;-----------------------------------------------------------------------------
; Horizontal two-tap filter, eight pixels per row.
;-----------------------------------------------------------------------------
globalsym(aom_highbd_filter_block1d8_h2_sse2)
sym(aom_highbd_filter_block1d8_h2_sse2):
    HBD_BIL_ENTER_TMP 8

    HIGH_GET_PARAM
.loop:
    movdqu    xmm0, [rsi]                   ; a: pixels x .. x + 7
    movdqu    xmm1, [rsi + 2]               ; b: pixels x + 1 .. x + 8

    HIGH_APPLY_FILTER_8 0
    jnz       .loop                         ; rows remaining (flags from dec rcx)

    HBD_BIL_LEAVE_TMP

;-----------------------------------------------------------------------------
; Horizontal two-tap filter, sixteen pixels per row.
;-----------------------------------------------------------------------------
globalsym(aom_highbd_filter_block1d16_h2_sse2)
sym(aom_highbd_filter_block1d16_h2_sse2):
    HBD_BIL_ENTER_TMP 9

    HIGH_GET_PARAM
.loop:
    movdqu    xmm0, [rsi]                   ; a: pixels x .. x + 7
    movdqu    xmm1, [rsi + 2]               ; b: pixels x + 1 .. x + 8
    movdqu    xmm2, [rsi + 16]              ; a: pixels x + 8 .. x + 15
    movdqu    xmm3, [rsi + 18]              ; b: pixels x + 9 .. x + 16

    HIGH_APPLY_FILTER_16 0
    jnz       .loop                         ; rows remaining (flags from dec rcx)

    HBD_BIL_LEAVE_TMP