;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text
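
; These kernels implement the "copy" and "avg" special cases of the convolve
; prototype: no filtering is applied, so the filter arguments (f, fxo, fxs,
; fyo, fys) are accepted purely for signature compatibility and never read.
; As a non-authoritative reference, inferred from the cglobal argument lists
; below rather than quoted from the vpx_dsp headers, the matching C
; declaration is assumed to look like:
;
;   void vpx_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride,
;                               uint8_t *dst, ptrdiff_t dst_stride,
;                               const InterpKernel *filter, int x0_q4,
;                               int x_step_q4, int y0_q4, int y_step_q4,
;                               int w, int h);
;
; (The highbd variants are assumed to take uint16_t pixel pointers plus a
; trailing bit depth argument.) With x86inc's usual private_prefix setup,
; the cglobal lines below expand to symbols such as vpx_convolve_copy_sse2
; and vpx_convolve_avg_sse2, plus the vpx_highbd_convolve_* equivalents
; under CONFIG_VP9_HIGHBITDEPTH.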

; convolve_fn emits one copy or average kernel: %1 selects copy/avg, and an
; optional %2 of 'highbd' selects the 16-bit pixel variant.
%macro convolve_fn 1-2
%ifidn %1, avg
; avg stages four dst rows in m4-m7 (see .w8/.w4), so reserve extra regs.
%define AUX_XMM_REGS 4
%else
%define AUX_XMM_REGS 0
%endif
%ifidn %2, highbd
%define pavg pavgw
cglobal %2_convolve_%1, 4, 8, 4+AUX_XMM_REGS, src, src_stride, \
                                              dst, dst_stride, \
                                              f, fxo, fxs, fyo, fys, w, h, bd
%else
%define pavg pavgb
cglobal convolve_%1, 4, 8, 4+AUX_XMM_REGS, src, src_stride, \
                                           dst, dst_stride, \
                                           f, fxo, fxs, fyo, fys, w, h
%endif
  mov                 r4d, dword wm
%ifidn %2, highbd
  ; 16-bit pixels: convert the width and both strides from pixels to bytes.
  shl                 r4d, 1
  shl         src_strideq, 1
  shl         dst_strideq, 1
%else
  cmp                 r4d, 4
  je                  .w4
%endif
  cmp                 r4d, 8
  je                  .w8
  cmp                 r4d, 16
  je                  .w16
  cmp                 r4d, 32
  je                  .w32
%ifidn %2, highbd
  cmp                 r4d, 64
  je                  .w64

  ; Fall through: a 64-pixel highbd row spans 128 bytes, one row per pass.
  mov                 r4d, dword hm
.loop128:
  movu                 m0, [srcq]
  movu                 m1, [srcq+16]
  movu                 m2, [srcq+32]
  movu                 m3, [srcq+48]
%ifidn %1, avg
  pavg                 m0, [dstq]
  pavg                 m1, [dstq+16]
  pavg                 m2, [dstq+32]
  pavg                 m3, [dstq+48]
%endif
  mova        [dstq     ], m0
  mova        [dstq  +16], m1
  mova        [dstq  +32], m2
  mova        [dstq  +48], m3
  movu                 m0, [srcq+64]
  movu                 m1, [srcq+80]
  movu                 m2, [srcq+96]
  movu                 m3, [srcq+112]
  add                srcq, src_strideq
%ifidn %1, avg
  pavg                 m0, [dstq+64]
  pavg                 m1, [dstq+80]
  pavg                 m2, [dstq+96]
  pavg                 m3, [dstq+112]
%endif
  mova        [dstq  +64], m0
  mova        [dstq  +80], m1
  mova        [dstq  +96], m2
  mova        [dstq +112], m3
  add                dstq, dst_strideq
  dec                 r4d
  jnz .loop128
  RET
%endif

.w64:
  mov                 r4d, dword hm
.loop64:
  movu                 m0, [srcq]
  movu                 m1, [srcq+16]
  movu                 m2, [srcq+32]
  movu                 m3, [srcq+48]
  add                srcq, src_strideq
%ifidn %1, avg
  pavg                 m0, [dstq]
  pavg                 m1, [dstq+16]
  pavg                 m2, [dstq+32]
  pavg                 m3, [dstq+48]
%endif
  mova        [dstq     ], m0
  mova        [dstq  +16], m1
  mova        [dstq  +32], m2
  mova        [dstq  +48], m3
  add                dstq, dst_strideq
  dec                 r4d
  jnz .loop64
  RET

.w32:
  ; Two rows per iteration.
  mov                 r4d, dword hm
.loop32:
  movu                 m0, [srcq]
  movu                 m1, [srcq+16]
  movu                 m2, [srcq+src_strideq]
  movu                 m3, [srcq+src_strideq+16]
  lea                srcq, [srcq+src_strideq*2]
%ifidn %1, avg
  pavg                 m0, [dstq]
  pavg                 m1, [dstq            +16]
  pavg                 m2, [dstq+dst_strideq]
  pavg                 m3, [dstq+dst_strideq+16]
%endif
  mova  [dstq               ], m0
  mova  [dstq            +16], m1
  mova  [dstq+dst_strideq   ], m2
  mova  [dstq+dst_strideq+16], m3
  lea                dstq, [dstq+dst_strideq*2]
  sub                 r4d, 2
  jnz .loop32
  RET

.w16:
  ; Four rows per iteration; r5/r6 hold 3*stride for the fourth row.
  mov                 r4d, dword hm
  lea                 r5q, [src_strideq*3]
  lea                 r6q, [dst_strideq*3]
.loop16:
  movu                 m0, [srcq]
  movu                 m1, [srcq+src_strideq]
  movu                 m2, [srcq+src_strideq*2]
  movu                 m3, [srcq+r5q]
  lea                srcq, [srcq+src_strideq*4]
%ifidn %1, avg
  pavg                 m0, [dstq]
  pavg                 m1, [dstq+dst_strideq]
  pavg                 m2, [dstq+dst_strideq*2]
  pavg                 m3, [dstq+r6q]
%endif
  mova  [dstq              ], m0
  mova  [dstq+dst_strideq  ], m1
  mova  [dstq+dst_strideq*2], m2
  mova  [dstq+r6q          ], m3
  lea                dstq, [dstq+dst_strideq*4]
  sub                 r4d, 4
  jnz .loop16
  RET

.w8:
  mov                 r4d, dword hm
  lea                 r5q, [src_strideq*3]
  lea                 r6q, [dst_strideq*3]
.loop8:
  movh                 m0, [srcq]
  movh                 m1, [srcq+src_strideq]
  movh                 m2, [srcq+src_strideq*2]
  movh                 m3, [srcq+r5q]
  lea                srcq, [srcq+src_strideq*4]
%ifidn %1, avg
  ; pavg's memory operand reads a full 16 bytes, so stage the 8-byte dst
  ; rows in m4-m7 instead of averaging straight from memory.
  movh                 m4, [dstq]
  movh                 m5, [dstq+dst_strideq]
  movh                 m6, [dstq+dst_strideq*2]
  movh                 m7, [dstq+r6q]
  pavg                 m0, m4
  pavg                 m1, m5
  pavg                 m2, m6
  pavg                 m3, m7
%endif
  movh  [dstq              ], m0
  movh  [dstq+dst_strideq  ], m1
  movh  [dstq+dst_strideq*2], m2
  movh  [dstq+r6q          ], m3
  lea                dstq, [dstq+dst_strideq*4]
  sub                 r4d, 4
  jnz .loop8
  RET

%ifnidn %2, highbd
; Width 4 only exists for 8-bit: a 4-pixel highbd row is 8 bytes wide and
; is handled by the .w8 path after the width doubling above.
.w4:
  mov                 r4d, dword hm
  lea                 r5q, [src_strideq*3]
  lea                 r6q, [dst_strideq*3]
.loop4:
  movd                 m0, [srcq]
  movd                 m1, [srcq+src_strideq]
  movd                 m2, [srcq+src_strideq*2]
  movd                 m3, [srcq+r5q]
  lea                srcq, [srcq+src_strideq*4]
%ifidn %1, avg
  movd                 m4, [dstq]
  movd                 m5, [dstq+dst_strideq]
  movd                 m6, [dstq+dst_strideq*2]
  movd                 m7, [dstq+r6q]
  pavg                 m0, m4
  pavg                 m1, m5
  pavg                 m2, m6
  pavg                 m3, m7
%endif
  movd  [dstq              ], m0
  movd  [dstq+dst_strideq  ], m1
  movd  [dstq+dst_strideq*2], m2
  movd  [dstq+r6q          ], m3
  lea                dstq, [dstq+dst_strideq*4]
  sub                 r4d, 4
  jnz .loop4
  RET
%endif
%endmacro

INIT_XMM sse2
convolve_fn copy
convolve_fn avg
%if CONFIG_VP9_HIGHBITDEPTH
convolve_fn copy, highbd
convolve_fn avg, highbd
%endif