;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

; convolve_fn: generate a width-dispatched copy or average kernel.
;   %1 = copy | avg  (avg blends with pavg, i.e. (src + dst + 1) >> 1)
;   %2 = highbd      (optional; 16-bit pixels)
%macro convolve_fn 1-2
INIT_XMM sse2
%ifidn %2, highbd
%define pavg pavgw
cglobal %2_convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
                                 fx, fxs, fy, fys, w, h, bd
%else
%define pavg pavgb
cglobal convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
                              fx, fxs, fy, fys, w, h
%endif
  mov                  r4d, dword wm
%ifidn %2, highbd
  ; High bitdepth pixels are 16 bits wide: double the width, the
  ; (pre-halved) pointers and the strides, so that the loops below
  ; work purely in byte units.
  shl                  r4d, 1
  shl                  srcq, 1
  shl                  src_strideq, 1
  shl                  dstq, 1
  shl                  dst_strideq, 1
%else
  cmp                  r4d, 4
  je                   .w4
%endif
  cmp                  r4d, 8
  je                   .w8
  cmp                  r4d, 16
  je                   .w16
  cmp                  r4d, 32
  je                   .w32
%ifidn %2, highbd
  cmp                  r4d, 64
  je                   .w64

  ; 128 bytes (64 high-bitdepth pixels) per row, one row per iteration
  mov                  r4d, dword hm
.loop128:
  movu                 m0, [srcq]
  movu                 m1, [srcq+16]
  movu                 m2, [srcq+32]
  movu                 m3, [srcq+48]
%ifidn %1, avg
  pavg                 m0, [dstq]
  pavg                 m1, [dstq+16]
  pavg                 m2, [dstq+32]
  pavg                 m3, [dstq+48]
%endif
  mova                 [dstq   ], m0
  mova                 [dstq+16], m1
  mova                 [dstq+32], m2
  mova                 [dstq+48], m3
  movu                 m0, [srcq+64]
  movu                 m1, [srcq+80]
  movu                 m2, [srcq+96]
  movu                 m3, [srcq+112]
  add                  srcq, src_strideq
%ifidn %1, avg
  pavg                 m0, [dstq+64]
  pavg                 m1, [dstq+80]
  pavg                 m2, [dstq+96]
  pavg                 m3, [dstq+112]
%endif
  mova                 [dstq+64 ], m0
  mova                 [dstq+80 ], m1
  mova                 [dstq+96 ], m2
  mova                 [dstq+112], m3
  add                  dstq, dst_strideq
  dec                  r4d
  jnz                  .loop128
  RET
%endif

.w64:
  ; 64 bytes per row, one row per iteration
  mov                  r4d, dword hm
.loop64:
  movu                 m0, [srcq]
  movu                 m1, [srcq+16]
  movu                 m2, [srcq+32]
  movu                 m3, [srcq+48]
  add                  srcq, src_strideq
%ifidn %1, avg
  pavg                 m0, [dstq]
  pavg                 m1, [dstq+16]
  pavg                 m2, [dstq+32]
  pavg                 m3, [dstq+48]
%endif
  mova                 [dstq   ], m0
  mova                 [dstq+16], m1
  mova                 [dstq+32], m2
  mova                 [dstq+48], m3
  add                  dstq, dst_strideq
  dec                  r4d
  jnz                  .loop64
  RET

.w32:
  ; 32 bytes per row, two rows per iteration
  mov                  r4d, dword hm
.loop32:
  movu                 m0, [srcq]
  movu                 m1, [srcq+16]
  movu                 m2, [srcq+src_strideq]
  movu                 m3, [srcq+src_strideq+16]
  lea                  srcq, [srcq+src_strideq*2]
%ifidn %1, avg
  pavg                 m0, [dstq]
  pavg                 m1, [dstq            +16]
  pavg                 m2, [dstq+dst_strideq]
  pavg                 m3, [dstq+dst_strideq+16]
%endif
  mova                 [dstq               ], m0
  mova                 [dstq            +16], m1
  mova                 [dstq+dst_strideq   ], m2
  mova                 [dstq+dst_strideq+16], m3
  lea                  dstq, [dstq+dst_strideq*2]
  sub                  r4d, 2
  jnz                  .loop32
  RET

.w16:
  ; 16 bytes per row, four rows per iteration
  mov                  r4d, dword hm
  lea                  r5q, [src_strideq*3]
  lea                  r6q, [dst_strideq*3]
.loop16:
  movu                 m0, [srcq]
  movu                 m1, [srcq+src_strideq]
  movu                 m2, [srcq+src_strideq*2]
  movu                 m3, [srcq+r5q]
  lea                  srcq, [srcq+src_strideq*4]
%ifidn %1, avg
  pavg                 m0, [dstq]
  pavg                 m1, [dstq+dst_strideq]
  pavg                 m2, [dstq+dst_strideq*2]
  pavg                 m3, [dstq+r6q]
%endif
  mova                 [dstq              ], m0
  mova                 [dstq+dst_strideq  ], m1
  mova                 [dstq+dst_strideq*2], m2
  mova                 [dstq+r6q          ], m3
  lea                  dstq, [dstq+dst_strideq*4]
  sub                  r4d, 4
  jnz                  .loop16
  RET

INIT_MMX sse           ; pavg on mmx registers still requires SSE
.w8:
  ; 8 bytes per row, four rows per iteration
  mov                  r4d, dword hm
  lea                  r5q, [src_strideq*3]
  lea                  r6q, [dst_strideq*3]
.loop8:
  movu                 m0, [srcq]
  movu                 m1, [srcq+src_strideq]
  movu                 m2, [srcq+src_strideq*2]
  movu                 m3, [srcq+r5q]
  lea                  srcq, [srcq+src_strideq*4]
%ifidn %1, avg
  pavg                 m0, [dstq]
  pavg                 m1, [dstq+dst_strideq]
  pavg                 m2, [dstq+dst_strideq*2]
  pavg                 m3, [dstq+r6q]
%endif
  mova                 [dstq              ], m0
  mova                 [dstq+dst_strideq  ], m1
  mova                 [dstq+dst_strideq*2], m2
  mova                 [dstq+r6q          ], m3
  lea                  dstq, [dstq+dst_strideq*4]
  sub                  r4d, 4
  jnz                  .loop8
  RET

%ifnidn %2, highbd
.w4:
  ; 4 bytes per row, four rows per iteration (8-bit only: a high-bitdepth
  ; 4-pixel row is 8 bytes and is dispatched to .w8 above)
  mov                  r4d, dword hm
  lea                  r5q, [src_strideq*3]
  lea                  r6q, [dst_strideq*3]
.loop4:
  movh                 m0, [srcq]
  movh                 m1, [srcq+src_strideq]
  movh                 m2, [srcq+src_strideq*2]
  movh                 m3, [srcq+r5q]
  lea                  srcq, [srcq+src_strideq*4]
%ifidn %1, avg
  movh                 m4, [dstq]
  movh                 m5, [dstq+dst_strideq]
  movh                 m6, [dstq+dst_strideq*2]
  movh                 m7, [dstq+r6q]
  pavg                 m0, m4
  pavg                 m1, m5
  pavg                 m2, m6
  pavg                 m3, m7
%endif
  movh                 [dstq              ], m0
  movh                 [dstq+dst_strideq  ], m1
  movh                 [dstq+dst_strideq*2], m2
  movh                 [dstq+r6q          ], m3
  lea                  dstq, [dstq+dst_strideq*4]
  sub                  r4d, 4
  jnz                  .loop4
  RET
%endif
%endmacro

convolve_fn copy
convolve_fn avg
%if CONFIG_VP9_HIGHBITDEPTH
convolve_fn copy, highbd
convolve_fn avg, highbd
%endif
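
; For reference, each generated 8-bit function behaves like the rough C
; sketch below; the high bitdepth variants do the same on uint16_t pixels.
; This is an illustrative assumption kept in comments so the file still
; assembles: the names convolve_copy_ref/convolve_avg_ref are hypothetical,
; and the filter arguments (fx/fxs/fy/fys) are accepted but unused, since
; copy/avg are the no-filter fast paths of the convolve interface.
;
;   #include <stddef.h>
;   #include <stdint.h>
;   #include <string.h>
;
;   static void convolve_copy_ref(const uint8_t *src, ptrdiff_t src_stride,
;                                 uint8_t *dst, ptrdiff_t dst_stride,
;                                 int w, int h) {
;     for (int y = 0; y < h; ++y) {
;       memcpy(dst, src, w);                /* the movu/mova pairs above */
;       src += src_stride;
;       dst += dst_stride;
;     }
;   }
;
;   static void convolve_avg_ref(const uint8_t *src, ptrdiff_t src_stride,
;                                uint8_t *dst, ptrdiff_t dst_stride,
;                                int w, int h) {
;     for (int y = 0; y < h; ++y) {
;       for (int x = 0; x < w; ++x)         /* pavgb: (a + b + 1) >> 1 */
;         dst[x] = (uint8_t)((dst[x] + src[x] + 1) >> 1);
;       src += src_stride;
;       dst += dst_stride;
;     }
;   }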