;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

%define BLOCK_HEIGHT_WIDTH 4
%define vp8_filter_weight 128
%define VP8_FILTER_SHIFT  7                 ; taps of each filter below sum to 128 = 1 << 7

SECTION .text

;-----------------------------------------------------------------------
;void vp8_filter_block1d_h6_mmx
;(
;    unsigned char   *src_ptr,
;    unsigned short  *output_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned int    pixel_step,
;    unsigned int    output_height,
;    unsigned int    output_width,
;    short           * vp8_filter
;)
;
; Horizontal 6-tap filter. Each loop iteration produces four output
; pixels from one source row:
;   out[i] = sat8((sum(k=0..5) p[i-2+k] * tap[k] + 64) >> 7),  i = 0..3
; and stores them as four 16-bit words (8 bytes) at output_ptr.
;
; Register roles inside the loop:
;   rsi = current source row          rdi = current output row
;   rcx = rows remaining              rax = output_width, added to rdi per row
;   rdx = vp8_filter                  r8  = src_pixels_per_line (64-bit builds only)
;   mm0 = zero                        mm1/mm2/mm6/mm7 = taps 1/2/3/4
;   mm3 = accumulator                 mm4/mm5 = scratch
;
; vp8_filter is read at byte offsets 0,16,...,80, four words per tap.
; pixel_step (arg 3) is never read.
; Each row reads source bytes [rsi-2, rsi+6].
; MMX state is left dirty (no emms is issued here).
;-----------------------------------------------------------------------
globalsym(vp8_filter_block1d_h6_mmx)
sym(vp8_filter_block1d_h6_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        mov         rdx,    arg(6)                  ; rdx = vp8_filter

        ; The four inner taps live in registers for the whole loop; the two
        ; outer taps (offsets 0 and 80) are multiplied straight from memory.
        movq        mm1,    [rdx + 16]              ; mm1 = tap 1
        movq        mm2,    [rdx + 32]              ; mm2 = tap 2
        movq        mm6,    [rdx + 48]              ; mm6 = tap 3
        movq        mm7,    [rdx + 64]              ; mm7 = tap 4

        mov         rdi,    arg(1)                  ; rdi = output_ptr
        mov         rsi,    arg(0)                  ; rsi = src_ptr
        movsxd      rcx,    dword ptr arg(4)        ; rcx = output_height (row count)
        movsxd      rax,    dword ptr arg(5)        ; rax = output_width, used as the
                                                    ;       per-row advance of rdi
        pxor        mm0,    mm0                     ; mm0 = 0, used to widen bytes to words

%if ABI_IS_32BIT
        ; 32-bit build: the source pitch is added directly from its stack
        ; slot inside the loop, so nothing to preload here.
%else
        movsxd      r8,     dword ptr arg(2)        ; r8 = src_pixels_per_line; loop
                                                    ;      invariant, so load it once
%endif

.nextrow:
        movq        mm3,    [rsi-2]                 ; mm3 = p-2..p5 (8 bytes)
        movq        mm4,    mm3                     ; mm4 = p-2..p5
        psrlq       mm3,    8                       ; mm3 = p-1..p5
        punpcklbw   mm3,    mm0                     ; mm3 = p-1..p2 as words
        pmullw      mm3,    mm1                     ; mm3 = p-1..p2 * tap 1

        movq        mm5,    mm4                     ; mm5 = p-2..p5 (keep a copy)
        punpckhbw   mm4,    mm0                     ; mm4 = p2..p5 as words
        pmullw      mm4,    mm7                     ; mm4 *= tap 4
        paddsw      mm3,    mm4                     ; mm3 += mm4

        movq        mm4,    mm5                     ; mm4 = p-2..p5 (keep a copy)
        psrlq       mm5,    16                      ; mm5 = p0..p5
        punpcklbw   mm5,    mm0                     ; mm5 = p0..p3 as words
        pmullw      mm5,    mm2                     ; mm5 *= tap 2
        paddsw      mm3,    mm5                     ; mm3 += mm5

        movq        mm5,    mm4                     ; mm5 = p-2..p5 (keep a copy)
        psrlq       mm4,    24                      ; mm4 = p1..p5
        punpcklbw   mm4,    mm0                     ; mm4 = p1..p4 as words
        pmullw      mm4,    mm6                     ; mm4 *= tap 3
        paddsw      mm3,    mm4                     ; mm3 += mm4

        ; outer taps, multiplied from memory
        movd        mm4,    [rsi+3]                 ; mm4 = p3..p6 (4 bytes)
        punpcklbw   mm4,    mm0                     ; mm4 = p3..p6 as words
        pmullw      mm4,    [rdx+80]                ; mm4 *= tap 5
        paddsw      mm3,    mm4                     ; mm3 += mm4

        punpcklbw   mm5,    mm0                     ; mm5 = p-2..p1 as words
        pmullw      mm5,    [rdx]                   ; mm5 *= tap 0
        paddsw      mm3,    mm5                     ; mm3 += mm5

        paddsw      mm3,    [GLOBAL(rd)]            ; mm3 += 64 (rounding)
        psraw       mm3,    VP8_FILTER_SHIFT        ; mm3 >>= 7 (arithmetic)
        packuswb    mm3,    mm0                     ; clamp to 0..255 ...
        punpcklbw   mm3,    mm0                     ; ... and widen back to words

        movq        [rdi],  mm3                     ; store four 16-bit results

%if ABI_IS_32BIT
        add         rsi,    dword ptr arg(2)        ; src += src_pixels_per_line
%else
        add         rsi,    r8                      ; src += src_pixels_per_line
%endif
        add         rdi,    rax                     ; dst += output_width

        dec         rcx                             ; one row done
        jnz         .nextrow                        ; loop while rows remain

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
;void vp8_filter_block1dc_v6_mmx
;(
;   short *src_ptr,
;   unsigned char *output_ptr,
;    int output_pitch,
;   unsigned int pixels_per_line,
;   unsigned int pixel_step,
;   unsigned int output_height,
;   unsigned int output_width,
;   short * vp8_filter
;)
;
; Vertical 6-tap filter over rows of 16-bit samples. Each loop iteration
; combines four words from each of six consecutive source rows
; (row -2 .. row 3 relative to the output row), rounds, shifts by
; VP8_FILTER_SHIFT, clamps to 0..255 and stores four bytes at output_ptr.
;
; Register roles inside the loop:
;   rsi = address of row -2           rdi = current output row
;   rdx = pixels_per_line, used as a byte offset between source rows
;   rax = output_pitch                rcx = rows remaining
;   rbx = vp8_filter (saved/restored around its use)
;   mm0 = zero                        mm1/mm2/mm6/mm7 = taps 1/2/3/4
;   mm5 = rounding constant           mm3 = accumulator, mm4 = scratch
;
; pixel_step (arg 4) and output_width (arg 6) are never read.
; MMX state is left dirty (no emms is issued here).
;-----------------------------------------------------------------------
globalsym(vp8_filter_block1dc_v6_mmx)
sym(vp8_filter_block1dc_v6_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        ; Fetch the rounding constant before rbx is reused for the filter
        ; pointer. NOTE(review): GET_GOT was handed rbx, so GLOBAL() may
        ; address through it on PIC builds - confirm against the macro.
        movq        mm5,    [GLOBAL(rd)]            ; mm5 = 64 in each word
        push        rbx                             ; preserve rbx across its reuse
        mov         rbx,    arg(7)                  ; rbx = vp8_filter
        movq        mm1,    [rbx + 16]              ; mm1 = tap 1
        movq        mm2,    [rbx + 32]              ; mm2 = tap 2
        movq        mm6,    [rbx + 48]              ; mm6 = tap 3
        movq        mm7,    [rbx + 64]              ; mm7 = tap 4

        movsxd      rdx,    dword ptr arg(3)        ; rdx = pixels_per_line
        mov         rdi,    arg(1)                  ; rdi = output_ptr
        mov         rsi,    arg(0)                  ; rsi = src_ptr
        sub         rsi,    rdx                     ; back up two rows so that
        sub         rsi,    rdx                     ; rsi addresses row -2
        movsxd      rcx,    DWORD PTR arg(5)        ; rcx = output_height (row count)
        movsxd      rax,    DWORD PTR arg(2)        ; rax = output_pitch
        pxor        mm0,    mm0                     ; mm0 = 0 (upper half for packuswb)


.nextrow_cv:
        movq        mm3,    [rsi+rdx]               ; mm3 = p0..p3 of row -1
        pmullw      mm3,    mm1                     ; mm3 *= tap 1


        movq        mm4,    [rsi + 4*rdx]           ; mm4 = p0..p3 of row 2
        pmullw      mm4,    mm7                     ; mm4 *= tap 4
        paddsw      mm3,    mm4                     ; mm3 += mm4

        movq        mm4,    [rsi + 2*rdx]           ; mm4 = p0..p3 of row 0
        pmullw      mm4,    mm2                     ; mm4 *= tap 2
        paddsw      mm3,    mm4                     ; mm3 += mm4

        movq        mm4,    [rsi]                   ; mm4 = p0..p3 of row -2
        pmullw      mm4,    [rbx]                   ; mm4 *= tap 0
        paddsw      mm3,    mm4                     ; mm3 += mm4


        ; Step the source one row now: rows 1 and 3 then sit at 2*rdx and
        ; 4*rdx, which avoids needing a 3*pitch offset. This is also the
        ; per-iteration source advance.
        add         rsi,    rdx
        movq        mm4,    [rsi + 2*rdx]           ; mm4 = p0..p3 of row 1
        pmullw      mm4,    mm6                     ; mm4 *= tap 3
        paddsw      mm3,    mm4                     ; mm3 += mm4

        movq        mm4,    [rsi + 4*rdx]           ; mm4 = p0..p3 of row 3
        pmullw      mm4,    [rbx +80]               ; mm4 *= tap 5
        paddsw      mm3,    mm4                     ; mm3 += mm4


        paddsw      mm3,    mm5                     ; mm3 += 64 (rounding)
        psraw       mm3,    VP8_FILTER_SHIFT        ; mm3 >>= 7 (arithmetic)
        packuswb    mm3,    mm0                     ; clamp to 0..255, pack to bytes

        movd        [rdi],mm3                       ; store four result bytes
        ; Each iteration re-reads five of the six source rows that the
        ; previous iteration loaded. The original author notes that this is
        ; avoidable but should be cheap while the block stays in cache.
        lea         rdi,    [rdi+rax]               ; dst += output_pitch
        dec         rcx                             ; one row done
        jnz         .nextrow_cv                     ; loop while rows remain

        pop         rbx                             ; restore rbx saved above

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


SECTION_RODATA
; Rounding constant added before the shift: 0x40 = 1 << (VP8_FILTER_SHIFT - 1).
align 16
rd:
    times 4 dw 0x40

; Six-tap filter table: eight filters, six taps each, every tap replicated
; into eight words (16 bytes) so it can be multiplied against a packed vector.
; The MMX routines above read the first four words of each tap.
align 16
global HIDDEN_DATA(sym(vp8_six_tap_x86))
sym(vp8_six_tap_x86):
    times 8 dw 0
    times 8 dw 0
    times 8 dw 128
    times 8 dw 0
    times 8 dw 0
    times 8 dw 0

    times 8 dw 0
    times 8 dw -6
    times 8 dw 123
    times 8 dw 12
    times 8 dw -1
    times 8 dw 0

    times 8 dw 2
    times 8 dw -11
    times 8 dw 108
    times 8 dw 36
    times 8 dw -8
    times 8 dw 1

    times 8 dw 0
    times 8 dw -9
    times 8 dw 93
    times 8 dw 50
    times 8 dw -6
    times 8 dw 0

    times 8 dw 3
    times 8 dw -16
    times 8 dw 77
    times 8 dw 77
    times 8 dw -16
    times 8 dw 3

    times 8 dw 0
    times 8 dw -6
    times 8 dw 50
    times 8 dw 93
    times 8 dw -9
    times 8 dw 0

    times 8 dw 1
    times 8 dw -8
    times 8 dw 36
    times 8 dw 108
    times 8 dw -11
    times 8 dw 2

    times 8 dw 0
    times 8 dw -1
    times 8 dw 12
    times 8 dw 123
    times 8 dw -6
    times 8 dw 0